github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/jobs/registry.go

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package jobs
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"math/rand"
    18  	"os"
    19  	"strings"
    20  	"time"
    21  
    22  	"github.com/cockroachdb/cockroach/pkg/base"
    23  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    24  	"github.com/cockroachdb/cockroach/pkg/keys"
    25  	"github.com/cockroachdb/cockroach/pkg/kv"
    26  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    27  	"github.com/cockroachdb/cockroach/pkg/security"
    28  	"github.com/cockroachdb/cockroach/pkg/settings"
    29  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    30  	"github.com/cockroachdb/cockroach/pkg/sql/sem/builtins"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    32  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    33  	"github.com/cockroachdb/cockroach/pkg/sql/sqlutil"
    34  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    35  	"github.com/cockroachdb/cockroach/pkg/util/envutil"
    36  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    37  	"github.com/cockroachdb/cockroach/pkg/util/log"
    38  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    39  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    40  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    41  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    42  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    43  	"github.com/cockroachdb/errors"
    44  	"github.com/cockroachdb/logtags"
    45  	opentracing "github.com/opentracing/opentracing-go"
    46  )
    47  
    48  const defaultLeniencySetting = 60 * time.Second
    49  
    50  // See https://github.com/cockroachdb/cockroach/issues/47892.
    51  const multiTenancyIssueNo = 47892
    52  
    53  var (
    54  	nodeLivenessLogLimiter = log.Every(5 * time.Second)
    55  	// LeniencySetting is the amount of time to defer any attempts to
    56  	// reschedule a job.  Visible for testing.
    57  	LeniencySetting = settings.RegisterDurationSetting(
    58  		"jobs.registry.leniency",
    59  		"the amount of time to defer any attempts to reschedule a job",
    60  		defaultLeniencySetting)
    61  	gcSetting = settings.RegisterDurationSetting(
    62  		"jobs.retention_time",
    63  		"the amount of time to retain records for completed jobs before deleting them",
    64  		time.Hour*24*14)
    65  )
    66  
    67  // Registry creates Jobs and manages their leases and cancelation.
    68  //
    69  // Job information is stored in the `system.jobs` table.  Each node will
    70  // poll this table and establish a lease on any claimed job. Registry
    71  // calculates its own liveness for a node based on the expiration time
    72  // of the underlying node-liveness lease.  This is because we want to
    73  // allow jobs assigned to temporarily non-live (i.e. saturated) nodes to
    74  // continue without being canceled.
    75  //
    76  // When a lease has been determined to be stale, a node may attempt to
    77  // claim the relevant job. Thus, a Registry must occasionally
    78  // re-validate its own leases to ensure that another node has not stolen
    79  // the work and cancel the local job if so.
    80  //
    81  // Prior versions of Registry used the node's epoch value to determine
    82  // whether or not a job should be stolen.  The current implementation
    83  // uses a time-based approach, where a node's last reported expiration
    84  // timestamp is used to calculate a liveness value for the purpose
    85  // of job scheduling.
    86  //
    87  // Mixed-version operation between epoch- and time-based nodes works
    88  // since we still publish epoch information in the leases for time-based
    89  // nodes.  From the perspective of a time-based node, an epoch-based
    90  // node simply behaves as though its leniency period is 0. Epoch-based
    91  // nodes will see time-based nodes delay the act of stealing a job.
    92  type Registry struct {
    93  	ac         log.AmbientContext
    94  	stopper    *stop.Stopper
    95  	nl         sqlbase.OptionalNodeLiveness
    96  	db         *kv.DB
    97  	ex         sqlutil.InternalExecutor
    98  	clock      *hlc.Clock
    99  	nodeID     *base.SQLIDContainer
   100  	settings   *cluster.Settings
   101  	planFn     planHookMaker
   102  	metrics    Metrics
   103  	adoptionCh chan struct{}
   104  
   105  	// sessionBoundInternalExecutorFactory provides a way for jobs to create
   106  	// internal executors. This is rarely needed, and usually job resumers should
   107  	// use the internal executor from the PlanHookState. The intended user of this
   108  	// interface is the schema change job resumer, which needs to set the
   109  	// tableCollectionModifier on the internal executor to different values in
   110  	// multiple concurrent queries. This situation is an exception to the internal
   111  	// executor generally being a stateless wrapper, and makes it impossible to
   112  	// reuse the same internal executor across all the queries (without
   113  	// refactoring to get rid of the tableCollectionModifier field, which we
   114  	// should do eventually).
   115  	//
   116  	// Note that, while this API is not ideal, internal executors are basically
   117  	// lightweight wrappers requiring no additional teardown. There's not much
   118  	// cost incurred in creating these.
   119  	//
   120  	// TODO (lucy): We should refactor and get rid of the tableCollectionModifier
   121  	// field. Modifying the TableCollection is basically a per-query operation
   122  	// and should be a per-query setting. #34304 is the issue for creating/
   123  	// improving this API.
   124  	sessionBoundInternalExecutorFactory sqlutil.SessionBoundInternalExecutorFactory
   125  
   126  	// preventAdoptionFile, if non-empty, is the path to a file whose presence prevents this node from adopting jobs.
   127  	preventAdoptionFile string
   128  
   129  	mu struct {
   130  		syncutil.Mutex
   131  		// epoch is present to support older nodes that are not using
   132  		// the timestamp-based approach to determine when to steal jobs.
   133  		// TODO: Remove this and deprecate Lease.Epoch proto field
   134  		epoch int64
   135  		// jobs holds a map from job id to its context cancel func. This should
   136  		// be populated with jobs that are currently being run (and owned) by
   137  		// this registry. Calling the func will cancel the context the job was
   138  		// started/resumed with. This should only be called by the registry when
   139  		// it is attempting to halt its own jobs due to liveness problems. Jobs
   140  		// are normally canceled on any node by the CANCEL JOB statement, which is
   141  		// propagated to jobs via the .Progressed call. This function should not be
   142  		// used to cancel a job in that way.
   143  		jobs map[int64]context.CancelFunc
   144  	}
   145  
   146  	TestingResumerCreationKnobs map[jobspb.Type]func(Resumer) Resumer
   147  }
   148  
   149  // planHookMaker is a wrapper around sql.NewInternalPlanner. It returns an
   150  // *sql.planner as an interface{} due to package dependency cycles. It should
   151  // be cast to that type in the sql package when it is used. Returns a cleanup
   152  // function that must be called once the caller is done with the planner.
   153  //
   154  // TODO(mjibson): Can we do something to avoid passing an interface{} here
   155  // that must be type casted in a Resumer? It cannot be done here because
   156  // PlanHookState lives in the sql package, which would create a dependency
   157  // cycle if listed here. Furthermore, moving PlanHookState into a common
   158  // subpackage like sqlbase is difficult because of the amount of sql-only
   159  // stuff that PlanHookState exports. One other choice is to merge this package
   160  // back into the sql package. There's maybe a better way that I'm unaware of.
   161  type planHookMaker func(opName, user string) (interface{}, func())
   162  
   163  // PreventAdoptionFile is the name of the file which, if present in the first
   164  // on-disk store, will prevent the adoption of background jobs by that node.
   165  const PreventAdoptionFile = "DISABLE_STARTING_BACKGROUND_JOBS"
   166  
   167  // MakeRegistry creates a new Registry. planFn is a wrapper around
   168  // sql.newInternalPlanner. It returns a sql.PlanHookState as an interface{},
   169  // which Resumer functions must type-assert back to sql.PlanHookState.
   170  func MakeRegistry(
   171  	ac log.AmbientContext,
   172  	stopper *stop.Stopper,
   173  	clock *hlc.Clock,
   174  	nl sqlbase.OptionalNodeLiveness,
   175  	db *kv.DB,
   176  	ex sqlutil.InternalExecutor,
   177  	nodeID *base.SQLIDContainer,
   178  	settings *cluster.Settings,
   179  	histogramWindowInterval time.Duration,
   180  	planFn planHookMaker,
   181  	preventAdoptionFile string,
   182  ) *Registry {
   183  	r := &Registry{
   184  		ac:                  ac,
   185  		stopper:             stopper,
   186  		clock:               clock,
   187  		nl:                  nl,
   188  		db:                  db,
   189  		ex:                  ex,
   190  		nodeID:              nodeID,
   191  		settings:            settings,
   192  		planFn:              planFn,
   193  		preventAdoptionFile: preventAdoptionFile,
   194  		adoptionCh:          make(chan struct{}),
   195  	}
   196  	r.mu.epoch = 1
   197  	r.mu.jobs = make(map[int64]context.CancelFunc)
   198  	r.metrics.InitHooks(histogramWindowInterval)
   199  	return r
   200  }
   201  
   202  // SetSessionBoundInternalExecutorFactory sets the
   203  // SessionBoundInternalExecutorFactory that will be used by the job registry
   204  // executor. We expose this separately from the constructor to avoid a circular
   205  // dependency.
   206  func (r *Registry) SetSessionBoundInternalExecutorFactory(
   207  	factory sqlutil.SessionBoundInternalExecutorFactory,
   208  ) {
   209  	r.sessionBoundInternalExecutorFactory = factory
   210  }
   211  
   212  // MetricsStruct returns the metrics for production monitoring of each job type.
   213  // They're all stored as the `metric.Struct` interface because of dependency
   214  // cycles.
   215  func (r *Registry) MetricsStruct() *Metrics {
   216  	return &r.metrics
   217  }
   218  
   219  // CurrentlyRunningJobs returns a slice of the ids of all jobs running on this node.
   220  func (r *Registry) CurrentlyRunningJobs() []int64 {
   221  	r.mu.Lock()
   222  	defer r.mu.Unlock()
   223  	jobs := make([]int64, len(r.mu.jobs))
   224  	i := 0
   225  	for jID := range r.mu.jobs {
   226  		jobs[i] = jID
   227  		i++
   228  	}
   229  	return jobs
   230  }
   231  
   232  // lenientNow returns the timestamp after which we should attempt
   233  // to steal a job from a node whose liveness is failing.  This allows
   234  // jobs coordinated by a node which is temporarily saturated to continue.
   235  func (r *Registry) lenientNow() time.Time {
   236  	// r.settings is cluster.NoSettings in some tests; fall back to the default.
   237  	var offset time.Duration
   238  	if r.settings == cluster.NoSettings {
   239  		offset = defaultLeniencySetting
   240  	} else {
   241  		offset = LeniencySetting.Get(&r.settings.SV)
   242  	}
   243  
   244  	return r.clock.Now().GoTime().Add(-offset)
   245  }
   246  
   247  // makeCtx returns a new context from r's ambient context and an associated
   248  // cancel func.
   249  func (r *Registry) makeCtx() (context.Context, func()) {
   250  	return context.WithCancel(r.ac.AnnotateCtx(context.Background()))
   251  }
   252  
   253  func (r *Registry) makeJobID() int64 {
   254  	return int64(builtins.GenerateUniqueInt(r.nodeID.SQLInstanceID()))
   255  }
   256  
   257  // CreateAndStartJob creates and asynchronously starts a job from record. An
   258  // error is returned if the job type has not been registered with
   259  // RegisterConstructor. The ctx passed to this function is not the context the
   260  // job will be started with (canceling ctx will not cause the job to cancel).
   261  func (r *Registry) CreateAndStartJob(
   262  	ctx context.Context, resultsCh chan<- tree.Datums, record Record,
   263  ) (*Job, <-chan error, error) {
   264  	var rj *StartableJob
   265  	if err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) (err error) {
   266  		rj, err = r.CreateStartableJobWithTxn(ctx, record, txn, resultsCh)
   267  		return err
   268  	}); err != nil {
   269  		return nil, nil, err
   270  	}
   271  	errCh, err := rj.Start(ctx)
   272  	if err != nil {
   273  		return nil, nil, err
   274  	}
   275  	return rj.Job, errCh, nil
   276  }
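
        // Example (editor's sketch, not part of the original source): a caller that
        // already holds a *Registry might create and start a job from a Record and
        // then wait on the returned error channel. The Record fields and the
        // jobspb details/progress types below are placeholders.
        //
        //	resultsCh := make(chan tree.Datums)
        //	job, errCh, err := registry.CreateAndStartJob(ctx, resultsCh, jobs.Record{
        //		Description: "example job",
        //		Username:    security.RootUser,
        //		Details:     jobspb.BackupDetails{},  // placeholder details payload
        //		Progress:    jobspb.BackupProgress{}, // matching progress payload
        //	})
        //	if err != nil {
        //		return err
        //	}
        //	go func() {
        //		for range resultsCh { // drain any results produced by the job
        //		}
        //	}()
        //	if err := <-errCh; err != nil {
        //		log.Errorf(ctx, "job %d failed: %v", *job.ID(), err)
        //	}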
   277  
   278  // Run starts previously unstarted jobs from a list of scheduled
   279  // jobs. Canceling ctx interrupts the waiting but doesn't cancel the jobs.
   280  func (r *Registry) Run(ctx context.Context, ex sqlutil.InternalExecutor, jobs []int64) error {
   281  	if len(jobs) == 0 {
   282  		return nil
   283  	}
   284  	log.Infof(ctx, "scheduled jobs %+v", jobs)
   285  	buf := bytes.Buffer{}
   286  	for i, id := range jobs {
   287  		select {
   288  		case r.adoptionCh <- struct{}{}:
   289  		case <-ctx.Done():
   290  			return ctx.Err()
   291  		}
   292  
   293  		if i > 0 {
   294  			buf.WriteString(",")
   295  		}
   296  		buf.WriteString(fmt.Sprintf(" %d", id))
   297  	}
   298  	// Manually retry instead of using SHOW JOBS WHEN COMPLETE so we have greater
   299  	// control over retries. Also, avoiding SHOW JOBS prevents us from having to
   300  	// populate the crdb_internal.jobs vtable.
   301  	query := fmt.Sprintf(
   302  		`SELECT count(*) FROM system.jobs WHERE id IN (%s)
   303         AND (status != 'succeeded' AND status != 'failed' AND status != 'canceled')`,
   304  		buf.String())
   305  	for r := retry.StartWithCtx(ctx, retry.Options{
   306  		InitialBackoff: 10 * time.Millisecond,
   307  		MaxBackoff:     1 * time.Second,
   308  		Multiplier:     2,
   309  	}); r.Next(); {
   310  		// We poll the number of queued jobs that aren't finished. As with SHOW JOBS
   311  		// WHEN COMPLETE, if one of the jobs is missing from the jobs table for
   312  		// whatever reason, we'll fail later when we try to load the job.
   313  		row, err := ex.QueryRowEx(
   314  			ctx,
   315  			"poll-show-jobs",
   316  			nil, /* txn */
   317  			sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
   318  			query,
   319  		)
   320  		if err != nil {
   321  			return errors.Wrap(err, "polling for queued jobs to complete")
   322  		}
   323  		count := int64(tree.MustBeDInt(row[0]))
   324  		if log.V(3) {
   325  			log.Infof(ctx, "waiting for %d queued jobs to complete", count)
   326  		}
   327  		if count == 0 {
   328  			break
   329  		}
   330  	}
   331  	for i, id := range jobs {
   332  		j, err := r.LoadJob(ctx, id)
   333  		if err != nil {
   334  			return errors.WithHint(
   335  				errors.Wrapf(err, "job %d could not be loaded", jobs[i]),
   336  				"The job may not have succeeded.")
   337  		}
   338  		if j.Payload().FinalResumeError != nil {
   339  			decodedErr := errors.DecodeError(ctx, *j.Payload().FinalResumeError)
   340  			return decodedErr
   341  		}
   342  		if j.Payload().Error != "" {
   343  			return errors.Newf("job %d failed with error: %s", jobs[i], j.Payload().Error)
   344  		}
   345  	}
   346  	return nil
   347  }
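
        // Example (editor's sketch, not part of the original source): Run is used to
        // wait for jobs that were created earlier, e.g. within a committed user
        // transaction; ie stands in for some sqlutil.InternalExecutor available to
        // the caller.
        //
        //	if err := registry.Run(ctx, ie, []int64{jobID}); err != nil {
        //		return err
        //	}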
   348  
   349  // NewJob creates a new Job.
   350  func (r *Registry) NewJob(record Record) *Job {
   351  	job := &Job{
   352  		registry: r,
   353  	}
   354  	job.mu.payload = jobspb.Payload{
   355  		Description:   record.Description,
   356  		Statement:     record.Statement,
   357  		Username:      record.Username,
   358  		DescriptorIDs: record.DescriptorIDs,
   359  		Details:       jobspb.WrapPayloadDetails(record.Details),
   360  		Noncancelable: record.NonCancelable,
   361  	}
   362  	job.mu.progress = jobspb.Progress{
   363  		Details:       jobspb.WrapProgressDetails(record.Progress),
   364  		RunningStatus: string(record.RunningStatus),
   365  	}
   366  	return job
   367  }
   368  
   369  // CreateJobWithTxn creates a job to be started later with StartJob.
   370  // It stores the job in the jobs table, marks it pending and gives the
   371  // current node a lease.
   372  func (r *Registry) CreateJobWithTxn(ctx context.Context, record Record, txn *kv.Txn) (*Job, error) {
   373  	j := r.NewJob(record)
   374  	if err := j.WithTxn(txn).insert(ctx, r.makeJobID(), r.newLease()); err != nil {
   375  		return nil, err
   376  	}
   377  	return j, nil
   378  }
   379  
   380  // CreateStartableJobWithTxn creates a job to be started later, after the
   381  // creating txn commits. The method uses the passed txn to write the job in the
   382  // jobs table, marks it pending and gives the current node a lease. It
   383  // additionally registers the job with the Registry which will prevent the
   384  // Registry from adopting the job after the transaction commits. The resultsCh
   385  // will be connected to the output of the job and written to after the returned
   386  // StartableJob is started.
   387  //
   388  // The returned job is not associated with the user transaction. The intention
   389  // is that the job will not be modified again in txn. If the transaction is
   390  // committed, the caller must explicitly Start it. If the transaction is rolled
   391  // back then the caller must call CleanupOnRollback to unregister the job from
   392  // the Registry.
   393  func (r *Registry) CreateStartableJobWithTxn(
   394  	ctx context.Context, record Record, txn *kv.Txn, resultsCh chan<- tree.Datums,
   395  ) (*StartableJob, error) {
   396  	j, err := r.CreateJobWithTxn(ctx, record, txn)
   397  	if err != nil {
   398  		return nil, err
   399  	}
   400  	// The job itself must not hold on to this transaction. We ensure in Start()
   401  	// that the transaction used to create the job is committed. When jobs hold
   402  	// onto transactions they use the transaction in methods which modify the job.
   403  	// On the whole this pattern is bug-prone and hard to reason about.
   404  	j.WithTxn(nil)
   405  	resumer, err := r.createResumer(j, r.settings)
   406  	if err != nil {
   407  		return nil, err
   408  	}
   409  	resumerCtx, cancel := r.makeCtx()
   410  	if err := r.register(*j.ID(), cancel); err != nil {
   411  		return nil, err
   412  	}
   413  	return &StartableJob{
   414  		Job:        j,
   415  		txn:        txn,
   416  		resumer:    resumer,
   417  		resumerCtx: resumerCtx,
   418  		cancel:     cancel,
   419  		resultsCh:  resultsCh,
   420  	}, nil
   421  }
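
        // Example (editor's sketch, not part of the original source): the intended
        // pattern is to create the StartableJob inside the user transaction and only
        // Start it once that transaction has committed, calling CleanupOnRollback if
        // the transaction fails. db, registry, record and resultsCh are placeholders.
        //
        //	var sj *jobs.StartableJob
        //	if err := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) (err error) {
        //		sj, err = registry.CreateStartableJobWithTxn(ctx, record, txn, resultsCh)
        //		return err
        //	}); err != nil {
        //		if sj != nil {
        //			if cleanupErr := sj.CleanupOnRollback(ctx); cleanupErr != nil {
        //				log.Warningf(ctx, "failed to clean up job: %v", cleanupErr)
        //			}
        //		}
        //		return err
        //	}
        //	errCh, err := sj.Start(ctx)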
   422  
   423  // LoadJob loads an existing job with the given jobID from the system.jobs
   424  // table.
   425  func (r *Registry) LoadJob(ctx context.Context, jobID int64) (*Job, error) {
   426  	return r.LoadJobWithTxn(ctx, jobID, nil)
   427  }
   428  
   429  // LoadJobWithTxn does the same as above, but using the transaction passed in
   430  // the txn argument. Passing a nil transaction is equivalent to calling LoadJob
   431  // in that a transaction will be automatically created.
   432  func (r *Registry) LoadJobWithTxn(ctx context.Context, jobID int64, txn *kv.Txn) (*Job, error) {
   433  	j := &Job{
   434  		id:       &jobID,
   435  		registry: r,
   436  	}
   437  	if err := j.WithTxn(txn).load(ctx); err != nil {
   438  		return nil, err
   439  	}
   440  	return j, nil
   441  }
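
        // Example (editor's sketch, not part of the original source): loading a job
        // inside an existing transaction and inspecting its payload; db, registry
        // and jobID are placeholders.
        //
        //	err := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
        //		j, err := registry.LoadJobWithTxn(ctx, jobID, txn)
        //		if err != nil {
        //			return err
        //		}
        //		log.Infof(ctx, "job %d: %s", *j.ID(), j.Payload().Description)
        //		return nil
        //	})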
   442  
   443  // DefaultCancelInterval is a reasonable interval at which to poll this node
   444  // for liveness failures and cancel running jobs.
   445  var DefaultCancelInterval = base.DefaultTxnHeartbeatInterval
   446  
   447  // DefaultAdoptInterval is a reasonable interval at which to poll system.jobs
   448  // for jobs with expired leases.
   449  //
   450  // DefaultAdoptInterval is mutable for testing. NB: Updates to this value after
   451  // Registry.Start has been called will not have any effect.
   452  var DefaultAdoptInterval = 30 * time.Second
   453  
   454  var maxAdoptionsPerLoop = envutil.EnvOrDefaultInt(`COCKROACH_JOB_ADOPTIONS_PER_PERIOD`, 10)
   455  
   456  // gcInterval is how often we check for and delete job records older than the
   457  // retention limit.
   458  const gcInterval = 1 * time.Hour
   459  
   460  // Start polls the current node for liveness failures and cancels all registered
   461  // jobs if it observes a failure. It also starts the registry's main daemons,
   462  // which poll the jobs table and start/cancel/gc jobs.
   463  func (r *Registry) Start(
   464  	ctx context.Context, stopper *stop.Stopper, cancelInterval, adoptInterval time.Duration,
   465  ) error {
   466  	// Calling maybeCancelJobs once at the start ensures we have an up-to-date
   467  	// liveness epoch before we wait out the first cancelInterval.
   468  	r.maybeCancelJobs(ctx, r.nl)
   469  
   470  	stopper.RunWorker(context.Background(), func(ctx context.Context) {
   471  		for {
   472  			select {
   473  			case <-stopper.ShouldStop():
   474  				return
   475  			case <-time.After(cancelInterval):
   476  				r.maybeCancelJobs(ctx, r.nl)
   477  			}
   478  		}
   479  	})
   480  
   481  	stopper.RunWorker(context.Background(), func(ctx context.Context) {
   482  		for {
   483  			select {
   484  			case <-stopper.ShouldStop():
   485  				return
   486  			case <-time.After(gcInterval):
   487  				old := timeutil.Now().Add(-1 * gcSetting.Get(&r.settings.SV))
   488  				if err := r.cleanupOldJobs(ctx, old); err != nil {
   489  					log.Warningf(ctx, "error cleaning up old job records: %v", err)
   490  				}
   491  			}
   492  		}
   493  	})
   494  
   495  	maybeAdoptJobs := func(ctx context.Context, randomizeJobOrder bool) {
   496  		if r.adoptionDisabled(ctx) {
   497  			r.cancelAll(ctx)
   498  			return
   499  		}
   500  		if err := r.maybeAdoptJob(ctx, r.nl, randomizeJobOrder); err != nil {
   501  			log.Errorf(ctx, "error while adopting jobs: %s", err)
   502  		}
   503  	}
   504  
   505  	stopper.RunWorker(context.Background(), func(ctx context.Context) {
   506  		for {
   507  			select {
   508  			case <-stopper.ShouldStop():
   509  				return
   510  			case <-r.adoptionCh:
   511  				// Try to adopt the most recently created job.
   512  				maybeAdoptJobs(ctx, false /* randomizeJobOrder */)
   513  			case <-time.After(adoptInterval):
   514  				maybeAdoptJobs(ctx, true /* randomizeJobOrder */)
   515  			}
   516  		}
   517  	})
   518  	return nil
   519  }
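
        // Example (editor's sketch, not part of the original source): server startup
        // code would typically start the registry's daemons with the package
        // defaults; ctx and stopper are placeholders owned by the server.
        //
        //	if err := registry.Start(
        //		ctx, stopper, jobs.DefaultCancelInterval, jobs.DefaultAdoptInterval,
        //	); err != nil {
        //		return err
        //	}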
   520  
   521  func (r *Registry) maybeCancelJobs(ctx context.Context, nlw sqlbase.OptionalNodeLiveness) {
   522  	// Cancel all jobs if the stopper is quiescing.
   523  	select {
   524  	case <-r.stopper.ShouldQuiesce():
   525  		r.cancelAll(ctx)
   526  		return
   527  	default:
   528  	}
   529  
   530  	nl, ok := nlw.Optional(multiTenancyIssueNo)
   531  	if !ok {
   532  		// At most one container is running on behalf of a SQL tenant, so it must be
   533  		// this one, and there's no point canceling anything.
   534  		//
   535  		// TODO(ajwerner): don't rely on this. Instead fix this issue:
   536  		// https://github.com/cockroachdb/cockroach/issues/47892
   537  		return
   538  	}
   539  	liveness, err := nl.Self()
   540  	if err != nil {
   541  		if nodeLivenessLogLimiter.ShouldLog() {
   542  			log.Warningf(ctx, "unable to get node liveness: %s", err)
   543  		}
   544  		// Conservatively assume our lease has expired. Abort all jobs.
   545  		r.cancelAll(ctx)
   546  		return
   547  	}
   548  
   549  	// If we haven't persisted a liveness record within the leniency
   550  	// interval, we'll cancel all of our jobs.
   551  	if !liveness.IsLive(r.lenientNow()) {
   552  		r.mu.Lock()
   553  		defer r.mu.Unlock()
   554  		r.cancelAllLocked(ctx)
   555  		r.mu.epoch = liveness.Epoch
   556  		return
   557  	}
   558  }
   559  
   560  // isOrphaned tries to detect whether there are no mutations left to be done for
   561  // the job, which would make it a candidate for garbage collection. Jobs can be
   562  // left in such an inconsistent state if they fail before being removed from the jobs table.
   563  func (r *Registry) isOrphaned(ctx context.Context, payload *jobspb.Payload) (bool, error) {
   564  	if payload.Type() != jobspb.TypeSchemaChange {
   565  		return false, nil
   566  	}
   567  	for _, id := range payload.DescriptorIDs {
   568  		pendingMutations := false
   569  		if err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   570  			td, err := sqlbase.GetTableDescFromID(ctx, txn, keys.TODOSQLCodec, id)
   571  			if err != nil {
   572  				return err
   573  			}
   574  			hasAnyMutations := len(td.GetMutations()) != 0 || len(td.GetGCMutations()) != 0
   575  			hasDropJob := td.DropJobID != 0
   576  			pendingMutations = hasAnyMutations || hasDropJob
   577  			return nil
   578  		}); err != nil {
   579  			if errors.Is(err, sqlbase.ErrDescriptorNotFound) {
   580  				// Treat missing table descriptors as no longer relevant for the
   581  				// job payload. See
   582  				// https://github.com/cockroachdb/cockroach/issues/45399.
   583  				continue
   584  			}
   585  			return false, err
   586  		}
   587  		if pendingMutations {
   588  			return false, nil
   589  		}
   590  	}
   591  	return true, nil
   592  }
   593  
   594  func (r *Registry) cleanupOldJobs(ctx context.Context, olderThan time.Time) error {
   595  	const stmt = `SELECT id, payload, status, created FROM system.jobs WHERE created < $1
   596  		      ORDER BY created LIMIT 1000`
   597  	rows, err := r.ex.Query(ctx, "gc-jobs", nil /* txn */, stmt, olderThan)
   598  	if err != nil {
   599  		return err
   600  	}
   601  
   602  	toDelete := tree.NewDArray(types.Int)
   603  	toDelete.Array = make(tree.Datums, 0, len(rows))
   604  	oldMicros := timeutil.ToUnixMicros(olderThan)
   605  	for _, row := range rows {
   606  		payload, err := UnmarshalPayload(row[1])
   607  		if err != nil {
   608  			return err
   609  		}
   610  		remove := false
   611  		switch Status(*row[2].(*tree.DString)) {
   612  		case StatusRunning, StatusPending:
   613  			done, err := r.isOrphaned(ctx, payload)
   614  			if err != nil {
   615  				return err
   616  			}
   617  			remove = done && row[3].(*tree.DTimestamp).Time.Before(olderThan)
   618  		case StatusSucceeded, StatusCanceled, StatusFailed:
   619  			remove = payload.FinishedMicros < oldMicros
   620  		}
   621  		if remove {
   622  			toDelete.Array = append(toDelete.Array, row[0])
   623  		}
   624  	}
   625  	if len(toDelete.Array) > 0 {
   626  		log.Infof(ctx, "cleaning up %d expired job records", len(toDelete.Array))
   627  		const stmt = `DELETE FROM system.jobs WHERE id = ANY($1)`
   628  		var nDeleted int
   629  		if nDeleted, err = r.ex.Exec(
   630  			ctx, "gc-jobs", nil /* txn */, stmt, toDelete,
   631  		); err != nil {
   632  			return errors.Wrap(err, "deleting old jobs")
   633  		}
   634  		if nDeleted != len(toDelete.Array) {
   635  			return errors.Errorf("asked to delete %d rows but %d were actually deleted",
   636  				len(toDelete.Array), nDeleted)
   637  		}
   638  	}
   639  	return nil
   640  }
   641  
   642  // getJobFn attempts to get a resumer from the given job id. If the job id
   643  // does not have a resumer then it returns an error message suitable for users.
   644  func (r *Registry) getJobFn(ctx context.Context, txn *kv.Txn, id int64) (*Job, Resumer, error) {
   645  	job, err := r.LoadJobWithTxn(ctx, id, txn)
   646  	if err != nil {
   647  		return nil, nil, err
   648  	}
   649  	resumer, err := r.createResumer(job, r.settings)
   650  	if err != nil {
   651  		return job, nil, errors.Errorf("job %d is not controllable", id)
   652  	}
   653  	return job, resumer, nil
   654  }
   655  
   656  // CancelRequested marks the job as cancel-requested using the specified txn (may be nil).
   657  func (r *Registry) CancelRequested(ctx context.Context, txn *kv.Txn, id int64) error {
   658  	job, _, err := r.getJobFn(ctx, txn, id)
   659  	if err != nil {
   660  		// Special case schema change jobs to mark the job as canceled.
   661  		if job != nil {
   662  			payload := job.Payload()
   663  			// TODO(mjibson): Use an unfortunate workaround to enable canceling of
   664  			// schema change jobs by comparing the string description. When a schema
   665  			// change job fails or is canceled, a new job is created with the ROLL BACK
   666  			// prefix. These rollback jobs cannot be canceled. We could add a field to
   667  			// the payload proto to indicate if this job is cancelable or not, but in
   668  			// a split version cluster an older node could pick up the schema change
   669  			// and fail to clear/set that field appropriately. Thus it seems that the
   670  			// safest way for now (i.e., without a larger jobs/schema change refactor)
   671  			// is to hack this up with a string comparison.
   672  			if payload.Type() == jobspb.TypeSchemaChange && !strings.HasPrefix(payload.Description, "ROLL BACK") {
   673  				return job.WithTxn(txn).cancelRequested(ctx, nil)
   674  			}
   675  		}
   676  		return err
   677  	}
   678  	return job.WithTxn(txn).cancelRequested(ctx, nil)
   679  }
   680  
   681  // PauseRequested marks the job with id as paused-requested using the specified txn (may be nil).
   682  func (r *Registry) PauseRequested(ctx context.Context, txn *kv.Txn, id int64) error {
   683  	job, resumer, err := r.getJobFn(ctx, txn, id)
   684  	if err != nil {
   685  		return err
   686  	}
   687  	var onPauseRequested onPauseRequestFunc
   688  	if pr, ok := resumer.(PauseRequester); ok {
   689  		onPauseRequested = pr.OnPauseRequest
   690  	}
   691  	return job.WithTxn(txn).pauseRequested(ctx, onPauseRequested)
   692  }
   693  
   694  // Succeeded marks the job with id as succeeded.
   695  func (r *Registry) Succeeded(ctx context.Context, txn *kv.Txn, id int64) error {
   696  	job, _, err := r.getJobFn(ctx, txn, id)
   697  	if err != nil {
   698  		return err
   699  	}
   700  	return job.WithTxn(txn).succeeded(ctx, nil)
   701  }
   702  
   703  // Failed marks the job with id as failed.
   704  func (r *Registry) Failed(ctx context.Context, txn *kv.Txn, id int64, causingError error) error {
   705  	job, _, err := r.getJobFn(ctx, txn, id)
   706  	if err != nil {
   707  		return err
   708  	}
   709  	return job.WithTxn(txn).failed(ctx, causingError, nil)
   710  }
   711  
   712  // Resume resumes the paused job with id using the specified txn (may be nil).
   713  func (r *Registry) Resume(ctx context.Context, txn *kv.Txn, id int64) error {
   714  	job, _, err := r.getJobFn(ctx, txn, id)
   715  	if err != nil {
   716  		return err
   717  	}
   718  	return job.WithTxn(txn).resumed(ctx)
   719  }
   720  
   721  // Resumer is a resumable job, and is associated with a Job object. Jobs can be
   722  // paused or canceled at any time. Jobs should call their CheckStatus() or
   723  // Progressed() method, which will return an error if the job has been paused or
   724  // canceled.
   725  //
   726  // Resumers are created through registered Constructor functions.
   727  //
   728  type Resumer interface {
   729  	// Resume is called when a job is started or resumed. Sending results on the
   730  	// chan will return them to a user, if a user's session is connected. phs
   731  	// is a sql.PlanHookState.
   732  	Resume(ctx context.Context, phs interface{}, resultsCh chan<- tree.Datums) error
   733  
   734  	// OnFailOrCancel is called when a job fails or is cancel-requested.
   735  	//
   736  	// This method will be called when a registry notices the cancel request,
   737  	// which is not guaranteed to run on the node where the job is running. So it
   738  	// cannot assume that any other methods have been called on this Resumer
   739  	// object.
   740  	OnFailOrCancel(ctx context.Context, phs interface{}) error
   741  }
   742  
   743  // PauseRequester is an extension of Resumer which allows job implementers to inject
   744  // logic during the transaction which moves a job to PauseRequested.
   745  type PauseRequester interface {
   746  	Resumer
   747  
   748  	// OnPauseRequest is called in the transaction that moves a job to PauseRequested.
   749  	// If an error is returned, the pause request will fail. phs is a
   750  	// sql.PlanHookState.
   751  	OnPauseRequest(ctx context.Context, phs interface{}, txn *kv.Txn, details *jobspb.Progress) error
   752  }
   753  
   754  // Constructor creates a resumable job of a certain type. The Resumer is
   755  // created on the coordinator each time the job is started/resumed, so it can
   756  // hold state. The Resume method is always run, and can set state on the Resumer
   757  // that can be used by the other methods.
   758  type Constructor func(job *Job, settings *cluster.Settings) Resumer
   759  
   760  var constructors = make(map[jobspb.Type]Constructor)
   761  
   762  // RegisterConstructor registers a Resumer constructor for a certain job type.
   763  func RegisterConstructor(typ jobspb.Type, fn Constructor) {
   764  	constructors[typ] = fn
   765  }
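
        // Example (editor's sketch, not part of the original source): a job
        // implementation defines a Resumer and registers a Constructor for its job
        // type, typically from an init function in its own package. The type name
        // exampleResumer and the use of jobspb.TypeBackup below are illustrative only.
        //
        //	type exampleResumer struct {
        //		job      *jobs.Job
        //		settings *cluster.Settings
        //	}
        //
        //	func (r *exampleResumer) Resume(
        //		ctx context.Context, phs interface{}, resultsCh chan<- tree.Datums,
        //	) error {
        //		// phs is a sql.PlanHookState; it is type-asserted in the sql package.
        //		return nil
        //	}
        //
        //	func (r *exampleResumer) OnFailOrCancel(ctx context.Context, phs interface{}) error {
        //		return nil
        //	}
        //
        //	func init() {
        //		jobs.RegisterConstructor(jobspb.TypeBackup,
        //			func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
        //				return &exampleResumer{job: job, settings: settings}
        //			})
        //	}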
   766  
   767  func (r *Registry) createResumer(job *Job, settings *cluster.Settings) (Resumer, error) {
   768  	payload := job.Payload()
   769  	fn := constructors[payload.Type()]
   770  	if fn == nil {
   771  		return nil, errors.Errorf("no resumer is available for %s", payload.Type())
   772  	}
   773  	if wrapper := r.TestingResumerCreationKnobs[payload.Type()]; wrapper != nil {
   774  		return wrapper(fn(job, settings)), nil
   775  	}
   776  	return fn(job, settings), nil
   777  }
   778  
   779  type retryJobError string
   780  
   781  // retryJobErrorSentinel exists so the errors returned from NewRetryJobError can
   782  // be marked with it, allowing more robust detection of retry errors even if
   783  // they are wrapped, etc. This was originally introduced to deal with injected
   784  // retry errors from testing knobs.
   785  var retryJobErrorSentinel = retryJobError("")
   786  
   787  // NewRetryJobError creates a new error that, if returned by a Resumer,
   788  // indicates to the jobs registry that the job should be restarted in the
   789  // background.
   790  func NewRetryJobError(s string) error {
   791  	return errors.Mark(retryJobError(s), retryJobErrorSentinel)
   792  }
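
        // Example (editor's sketch, not part of the original source): a Resumer that
        // hits a condition it considers transient can ask the registry to restart it
        // in the background instead of failing the job; doWork and isTransient are
        // placeholders.
        //
        //	if err := doWork(ctx); err != nil {
        //		if isTransient(err) {
        //			return jobs.NewRetryJobError("transient failure; will retry")
        //		}
        //		return err
        //	}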
   793  
   794  func (r retryJobError) Error() string {
   795  	return string(r)
   796  }
   797  
   798  // stepThroughStateMachine implements the state machine of the job lifecycle.
   799  // The job is executed with the ctx, so ctx must only be canceled if the job
   800  // should also be canceled. resultsCh is passed to the resumable func and should
   801  // be closed by the caller after errCh sends a value. errCh returns an error if
   802  // the job was not completed with success. status is the current job status.
   803  func (r *Registry) stepThroughStateMachine(
   804  	ctx context.Context,
   805  	phs interface{},
   806  	resumer Resumer,
   807  	resultsCh chan<- tree.Datums,
   808  	job *Job,
   809  	status Status,
   810  	jobErr error,
   811  ) error {
   812  	log.Infof(ctx, "job %d: stepping through state %s with error: %v", *job.ID(), status, jobErr)
   813  	switch status {
   814  	case StatusRunning:
   815  		if jobErr != nil {
   816  			return errors.NewAssertionErrorWithWrappedErrf(jobErr,
   817  				"job %d: resuming with non-nil error", *job.ID())
   818  		}
   819  		resumeCtx := logtags.AddTag(ctx, "job", *job.ID())
   820  		err := resumer.Resume(resumeCtx, phs, resultsCh)
   821  		if err == nil {
   822  			return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, StatusSucceeded, nil)
   823  		}
   824  		if resumeCtx.Err() != nil {
   825  			// The context was canceled. Tell the user, but don't attempt to
   826  			// mark the job as failed because it can be resumed by another node.
   827  			//
   828  			// TODO(ajwerner): We'll also end up here if the job was canceled or
   829  			// paused. We should make this error clearer.
   830  			return errors.Errorf("job %d: node liveness error: restarting in background", *job.ID())
   831  		}
   832  		// TODO(spaskob): enforce a limit on retries.
   833  		// TODO(spaskob,lucy): Add metrics on job retries. Consider having a backoff
   834  		// mechanism (possibly combined with a retry limit).
   835  		if errors.Is(err, retryJobErrorSentinel) {
   836  			return errors.Errorf("job %d: %s: restarting in background", *job.ID(), err)
   837  		}
   838  		if sErr := (*InvalidStatusError)(nil); errors.As(err, &sErr) {
   839  			if sErr.status != StatusCancelRequested && sErr.status != StatusPauseRequested {
   840  				return errors.NewAssertionErrorWithWrappedErrf(jobErr,
   841  					"job %d: unexpected status %s provided for a running job", *job.ID(), sErr.status)
   842  			}
   843  			return sErr
   844  		}
   845  		return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, StatusReverting, err)
   846  	case StatusPauseRequested:
   847  		return errors.Errorf("job %s", status)
   848  	case StatusCancelRequested:
   849  		return errors.Errorf("job %s", status)
   850  	case StatusPaused:
   851  		return errors.NewAssertionErrorWithWrappedErrf(jobErr,
   852  			"job %d: unexpected status %s provided to state machine", *job.ID(), status)
   853  	case StatusCanceled:
   854  		if err := job.canceled(ctx, nil); err != nil {
   855  			// If we can't transactionally mark the job as canceled then it will be
   856  			// restarted during the next adopt loop and reverting will be retried.
   857  			return errors.Wrapf(err, "job %d: could not mark as canceled: %v", *job.ID(), jobErr)
   858  		}
   859  		return errors.WithSecondaryError(errors.Errorf("job %s", status), jobErr)
   860  	case StatusSucceeded:
   861  		if jobErr != nil {
   862  			return errors.NewAssertionErrorWithWrappedErrf(jobErr,
   863  				"job %d: successful but unexpected error provided", *job.ID())
   864  		}
   865  		if err := job.succeeded(ctx, nil); err != nil {
   866  			// If it didn't succeed, we consider the job as failed and need to go
   867  			// through reverting state first.
   868  			// TODO(spaskob): this is silly, we should remove the OnSuccess hooks and
   869  			// execute them in resume so that the client can handle these errors
   870  			// better.
   871  			return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, StatusReverting, errors.Wrapf(err, "could not mark job %d as succeeded", *job.ID()))
   872  		}
   873  		return nil
   874  	case StatusReverting:
   875  		if err := job.reverted(ctx, jobErr, nil); err != nil {
   876  			// If we can't transactionally mark the job as reverting then it will be
   877  			// restarted during the next adopt loop and it will be retried.
   878  			return errors.Wrapf(err, "job %d: could not mark as reverting: %s", *job.ID(), jobErr)
   879  		}
   880  		onFailOrCancelCtx := logtags.AddTag(ctx, "job", *job.ID())
   881  		err := resumer.OnFailOrCancel(onFailOrCancelCtx, phs)
   882  		if successOnFailOrCancel := err == nil; successOnFailOrCancel {
   883  			// If the job failed with any error other than canceled, we mark it
   884  			// as Failed.
   885  			nextStatus := StatusFailed
   886  			if errors.Is(jobErr, errJobCanceled) {
   887  				nextStatus = StatusCanceled
   888  			}
   889  			return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, nextStatus, jobErr)
   890  		}
   891  		if onFailOrCancelCtx.Err() != nil {
   892  			// The context was canceled. Tell the user, but don't attempt to
   893  			// mark the job as failed because it can be resumed by another node.
   894  			return errors.Errorf("job %d: node liveness error: restarting in background", *job.ID())
   895  		}
   896  		if errors.Is(err, retryJobErrorSentinel) {
   897  			return errors.Errorf("job %d: %s: restarting in background", *job.ID(), err)
   898  		}
   899  		if sErr := (*InvalidStatusError)(nil); errors.As(err, &sErr) {
   900  			if sErr.status != StatusPauseRequested {
   901  				return errors.NewAssertionErrorWithWrappedErrf(jobErr,
   902  					"job %d: unexpected status %s provided for a reverting job", *job.ID(), sErr.status)
   903  			}
   904  			return sErr
   905  		}
   906  		return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, StatusFailed, errors.Wrapf(err, "job %d: cannot be reverted, manual cleanup may be required", *job.ID()))
   907  	case StatusFailed:
   908  		if jobErr == nil {
   909  			return errors.NewAssertionErrorWithWrappedErrf(jobErr,
   910  				"job %d: has StatusFailed but no error was provided", *job.ID())
   911  		}
   912  		if err := job.failed(ctx, jobErr, nil); err != nil {
   913  			// If we can't transactionally mark the job as failed then it will be
   914  			// restarted during the next adopt loop and reverting will be retried.
   915  			return errors.Wrapf(err, "job %d: could not mark as failed: %s", *job.ID(), jobErr)
   916  		}
   917  		return jobErr
   918  	default:
   919  		return errors.NewAssertionErrorWithWrappedErrf(jobErr,
   920  			"job %d: has unsupported status %s", *job.ID(), status)
   921  	}
   922  }
   923  
   924  // resume starts or resumes a job. If no error is returned then the job was
   925  // asynchronously executed. The job is executed with the ctx, so ctx must
   926  // only be canceled if the job should also be canceled. resultsCh is passed
   927  // to the resumable func and should be closed by the caller after errCh sends
   928  // a value.
   929  func (r *Registry) resume(
   930  	ctx context.Context, resumer Resumer, resultsCh chan<- tree.Datums, job *Job,
   931  ) (<-chan error, error) {
   932  	errCh := make(chan error, 1)
   933  	taskName := fmt.Sprintf(`job-%d`, *job.ID())
   934  	if err := r.stopper.RunAsyncTask(ctx, taskName, func(ctx context.Context) {
   935  		// Bookkeeping.
   936  		payload := job.Payload()
   937  		phs, cleanup := r.planFn("resume-"+taskName, payload.Username)
   938  		defer cleanup()
   939  		spanName := fmt.Sprintf(`%s-%d`, payload.Type(), *job.ID())
   940  		var span opentracing.Span
   941  		ctx, span = r.ac.AnnotateCtxWithSpan(ctx, spanName)
   942  		defer span.Finish()
   943  
   944  		// Run the actual job.
   945  		status, err := job.CurrentStatus(ctx)
   946  		if err == nil {
   947  			var finalResumeError error
   948  			if job.Payload().FinalResumeError != nil {
   949  				finalResumeError = errors.DecodeError(ctx, *job.Payload().FinalResumeError)
   950  			}
   951  			err = r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, status, finalResumeError)
   952  			if err != nil {
   953  				// TODO (lucy): This needs to distinguish between assertion errors in
   954  				// the job registry and assertion errors in job execution returned from
   955  				// Resume() or OnFailOrCancel(), and only fail on the former. We have
   956  				// tests that purposely introduce bad state in order to produce
   957  				// assertion errors, which shouldn't cause the test to panic. For now,
   958  				// comment this out.
   959  				// if errors.HasAssertionFailure(err) {
   960  				// 	log.ReportOrPanic(ctx, nil, err.Error())
   961  				// }
   962  				log.Errorf(ctx, "job %d: adoption completed with error %v", *job.ID(), err)
   963  			}
   964  			status, err := job.CurrentStatus(ctx)
   965  			if err != nil {
   966  				log.Errorf(ctx, "job %d: failed querying status: %v", *job.ID(), err)
   967  			} else {
   968  				log.Infof(ctx, "job %d: status %s after adoption finished", *job.ID(), status)
   969  			}
   970  		}
   971  		r.unregister(*job.ID())
   972  		errCh <- err
   973  	}); err != nil {
   974  		return nil, err
   975  	}
   976  	return errCh, nil
   977  }
   978  
   979  func (r *Registry) adoptionDisabled(ctx context.Context) bool {
   980  	if r.preventAdoptionFile != "" {
   981  		if _, err := os.Stat(r.preventAdoptionFile); err != nil {
   982  			if !os.IsNotExist(err) {
   983  				log.Warningf(ctx, "error checking if job adoption is currently disabled: %v", err)
   984  			}
   985  			return false
   986  		}
   987  		log.Warningf(ctx, "job adoption is currently disabled by existence of %s", r.preventAdoptionFile)
   988  		return true
   989  	}
   990  	return false
   991  }
   992  
   993  func (r *Registry) maybeAdoptJob(
   994  	ctx context.Context, nlw sqlbase.OptionalNodeLiveness, randomizeJobOrder bool,
   995  ) error {
   996  	const stmt = `
   997  SELECT id, payload, progress IS NULL, status
   998  FROM system.jobs
   999  WHERE status IN ($1, $2, $3, $4, $5) ORDER BY created DESC`
  1000  	rows, err := r.ex.Query(
  1001  		ctx, "adopt-job", nil /* txn */, stmt,
  1002  		StatusPending, StatusRunning, StatusCancelRequested, StatusPauseRequested, StatusReverting,
  1003  	)
  1004  	if err != nil {
  1005  		return errors.Wrap(err, "failed querying for jobs")
  1006  	}
  1007  
  1008  	if randomizeJobOrder {
  1009  		rand.Seed(timeutil.Now().UnixNano())
  1010  		rand.Shuffle(len(rows), func(i, j int) { rows[i], rows[j] = rows[j], rows[i] })
  1011  	}
  1012  
  1013  	type nodeStatus struct {
  1014  		isLive bool
  1015  	}
  1016  	nodeStatusMap := map[roachpb.NodeID]*nodeStatus{
  1017  		// 0 is not a valid node ID, but we treat it as an always-dead node so that
  1018  		// the empty lease (Lease{}) is always considered expired.
  1019  		0: {isLive: false},
  1020  	}
  1021  	// If no liveness is available, adopt all jobs. This is reasonable because this
  1022  	// only affects SQL tenants, which have at most one SQL server running on their
  1023  	// behalf at any given time.
  1024  	if nl, ok := nlw.Optional(multiTenancyIssueNo); ok {
  1025  		// We subtract the leniency interval here to artificially
  1026  		// widen the range of times over which the job registry will
  1027  		// consider the node to be alive.  We rely on the fact that
  1028  		// only a live node updates its own expiration.  Thus, the
  1029  		// expiration time can be used as a reasonable measure of
  1030  		// when the node was last seen.
  1031  		now := r.lenientNow()
  1032  		for _, liveness := range nl.GetLivenesses() {
  1033  			nodeStatusMap[liveness.NodeID] = &nodeStatus{
  1034  				isLive: liveness.IsLive(now),
  1035  			}
  1036  
  1037  			// Don't try to start any more jobs unless we're really live,
  1038  			// otherwise we'd just immediately cancel them.
  1039  			if liveness.NodeID == r.nodeID.DeprecatedNodeID(multiTenancyIssueNo) {
  1040  				if !liveness.IsLive(r.clock.Now().GoTime()) {
  1041  					return errors.Errorf(
  1042  						"trying to adopt jobs on node %d which is not live", liveness.NodeID)
  1043  				}
  1044  			}
  1045  		}
  1046  	}
  1047  
  1048  	if log.V(3) {
  1049  		log.Infof(ctx, "evaluating %d jobs for adoption", len(rows))
  1050  	}
  1051  
  1052  	var adopted int
  1053  	for _, row := range rows {
  1054  		if adopted >= maxAdoptionsPerLoop {
  1055  			// Leave excess jobs for other nodes to get their fair share.
  1056  			break
  1057  		}
  1058  
  1059  		id := (*int64)(row[0].(*tree.DInt))
  1060  
  1061  		payload, err := UnmarshalPayload(row[1])
  1062  		if err != nil {
  1063  			return err
  1064  		}
  1065  
  1066  		status := Status(tree.MustBeDString(row[3]))
  1067  		if log.V(3) {
  1068  			log.Infof(ctx, "job %d: evaluating for adoption with status `%s` and lease %v",
  1069  				*id, status, payload.Lease)
  1070  		}
  1071  
  1072  		// In version 20.1, the registry must not adopt 19.2-style schema change
  1073  		// jobs until they've undergone a migration.
  1074  		// TODO (lucy): Remove this in 20.2.
  1075  		if isOldSchemaChangeJob(payload) {
  1076  			log.VEventf(ctx, 2, "job %d: skipping adoption because schema change job has not been migrated", *id)
  1077  			continue
  1078  		}
  1079  
  1080  		if payload.Lease == nil {
  1081  			// If the lease is missing, it simply means the job does not yet support
  1082  			// resumability.
  1083  			if log.V(2) {
  1084  				log.Infof(ctx, "job %d: skipping: nil lease", *id)
  1085  			}
  1086  			continue
  1087  		}
  1088  
  1089  		// If the job has no progress it is from a 2.0 cluster. If the entire cluster
  1090  		// has been upgraded to 2.1 then we know nothing is running the job and it
  1091  		// can be safely failed.
  1092  		if nullProgress, ok := row[2].(*tree.DBool); ok && bool(*nullProgress) {
  1093  			log.Warningf(ctx, "job %d predates cluster upgrade and must be re-run", *id)
  1094  			versionErr := errors.New("job predates cluster upgrade and must be re-run")
  1095  			payload.Error = versionErr.Error()
  1096  			payloadBytes, err := protoutil.Marshal(payload)
  1097  			if err != nil {
  1098  				return err
  1099  			}
  1100  
  1101  			// We can't use job.update here because it fails while attempting to unmarshal
  1102  			// the progress. Setting the status to failed is idempotent so we don't care
  1103  			// if multiple nodes execute this.
  1104  			const updateStmt = `UPDATE system.jobs SET status = $1, payload = $2 WHERE id = $3`
  1105  			updateArgs := []interface{}{StatusFailed, payloadBytes, *id}
  1106  			err = r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  1107  				_, err := r.ex.Exec(ctx, "job-update", txn, updateStmt, updateArgs...)
  1108  				return err
  1109  			})
  1110  			if err != nil {
  1111  				log.Warningf(ctx, "job %d: has no progress but unable to mark failed: %s", *id, err)
  1112  			}
  1113  			continue
  1114  		}
  1115  
  1116  		r.mu.Lock()
  1117  		_, runningOnNode := r.mu.jobs[*id]
  1118  		r.mu.Unlock()
  1119  
  1120  		// If we're running as a tenant (!ok), then we are the sole SQL server in
  1121  		// charge of its jobs and ought to adopt all of them. Otherwise, look more
  1122  		// closely at who is running the job and whether to adopt.
  1123  		if nodeID, ok := r.nodeID.OptionalNodeID(); ok && nodeID != payload.Lease.NodeID {
  1124  			// Another node holds the lease on the job, see if we should steal it.
  1125  			if runningOnNode {
  1126  				// If we are currently running a job that another node has the lease on,
  1127  				// stop running it.
  1128  				log.Warningf(ctx, "job %d: node %d owns lease; canceling", *id, payload.Lease.NodeID)
  1129  				r.unregister(*id)
  1130  				continue
  1131  			}
  1132  			nodeStatus, ok := nodeStatusMap[payload.Lease.NodeID]
  1133  			if !ok {
  1134  				// This case should never happen.
  1135  				log.ReportOrPanic(ctx, nil, "job %d: skipping: no liveness record for the job's node %d",
  1136  					log.Safe(*id), payload.Lease.NodeID)
  1137  				continue
  1138  			}
  1139  			if nodeStatus.isLive {
  1140  				if log.V(2) {
  1141  					log.Infof(ctx, "job %d: skipping: another node is live and holds the lease", *id)
  1142  				}
  1143  				continue
  1144  			}
  1145  		}
  1146  
  1147  		// Below we know that this node holds the lease on the job, or that we want
  1148  		// to adopt it anyway because the leaseholder seems dead.
  1149  		job := &Job{id: id, registry: r}
  1150  		resumeCtx, cancel := r.makeCtx()
  1151  
  1152  		if pauseRequested := status == StatusPauseRequested; pauseRequested {
  1153  			if err := job.paused(ctx, func(context.Context, *kv.Txn) error {
  1154  				r.unregister(*id)
  1155  				return nil
  1156  			}); err != nil {
  1157  				log.Errorf(ctx, "job %d: could not set to paused: %v", *id, err)
  1158  				continue
  1159  			}
  1160  			log.Infof(ctx, "job %d: paused", *id)
  1161  			continue
  1162  		}
  1163  
  1164  		if cancelRequested := status == StatusCancelRequested; cancelRequested {
  1165  			if err := job.reverted(ctx, errJobCanceled, func(context.Context, *kv.Txn) error {
  1166  				// Unregister the job in case it is running on the node.
  1167  				// Unregister is a no-op for jobs that are not running.
  1168  				r.unregister(*id)
  1169  				return nil
  1170  			}); err != nil {
  1171  				log.Errorf(ctx, "job %d: could not set to reverting: %v", *id, err)
  1172  				continue
  1173  			}
  1174  			log.Infof(ctx, "job %d: canceled: the job is now reverting", *id)
  1175  		} else if currentlyRunning := r.register(*id, cancel) != nil; currentlyRunning {
  1176  			if log.V(3) {
  1177  				log.Infof(ctx, "job %d: skipping: the job is already running/reverting on this node", *id)
  1178  			}
  1179  			continue
  1180  		}
  1181  
  1182  		// Check if job status has changed in the meanwhile.
  1183  		currentStatus, err := job.CurrentStatus(ctx)
  1184  		if err != nil {
  1185  			return err
  1186  		}
  1187  		if status != currentStatus {
  1188  			continue
  1189  		}
  1190  		// Adopt job and resume/revert it.
  1191  		if err := job.adopt(ctx, payload.Lease); err != nil {
  1192  			r.unregister(*id)
  1193  			return errors.Wrap(err, "unable to acquire lease")
  1194  		}
  1195  
  1196  		resultsCh := make(chan tree.Datums)
  1197  		resumer, err := r.createResumer(job, r.settings)
  1198  		if err != nil {
  1199  			r.unregister(*id)
  1200  			return err
  1201  		}
  1202  		log.Infof(ctx, "job %d: resuming execution", *id)
  1203  		errCh, err := r.resume(resumeCtx, resumer, resultsCh, job)
  1204  		if err != nil {
  1205  			r.unregister(*id)
  1206  			return err
  1207  		}
  1208  		go func() {
  1209  			// Drain and ignore results.
  1210  			for range resultsCh {
  1211  			}
  1212  		}()
  1213  		go func() {
  1214  			// Wait for the job to finish. No need to print the error because if there
  1215  			// was one it's been set in the job status already.
  1216  			<-errCh
  1217  			close(resultsCh)
  1218  		}()
  1219  
  1220  		adopted++
  1221  	}
  1222  
  1223  	return nil
  1224  }
  1225  
  1226  func (r *Registry) newLease() *jobspb.Lease {
  1227  	nodeID := r.nodeID.DeprecatedNodeID(multiTenancyIssueNo)
  1228  	if nodeID == 0 {
  1229  		panic("jobs.Registry has empty node ID")
  1230  	}
  1231  	r.mu.Lock()
  1232  	defer r.mu.Unlock()
  1233  	return &jobspb.Lease{NodeID: nodeID, Epoch: r.mu.epoch}
  1234  }
  1235  
  1236  func (r *Registry) cancelAll(ctx context.Context) {
  1237  	r.mu.Lock()
  1238  	defer r.mu.Unlock()
  1239  	r.cancelAllLocked(ctx)
  1240  }
  1241  
  1242  func (r *Registry) cancelAllLocked(ctx context.Context) {
  1243  	r.mu.AssertHeld()
  1244  	for jobID, cancel := range r.mu.jobs {
  1245  		log.Warningf(ctx, "job %d: canceling due to liveness failure", jobID)
  1246  		cancel()
  1247  	}
  1248  	r.mu.jobs = make(map[int64]context.CancelFunc)
  1249  }
  1250  
  1251  // register registers an about-to-be-resumed job in memory so that it can be
  1252  // killed and so that no one else tries to resume it. This essentially works as a
  1253  // barrier that only one function can cross and try to resume the job.
  1254  func (r *Registry) register(jobID int64, cancel func()) error {
  1255  	r.mu.Lock()
  1256  	defer r.mu.Unlock()
  1257  	// We need to prevent different routines trying to adopt and resume the job.
  1258  	if _, alreadyRegistered := r.mu.jobs[jobID]; alreadyRegistered {
  1259  		return errors.Errorf("job %d: already registered", jobID)
  1260  	}
  1261  	r.mu.jobs[jobID] = cancel
  1262  	return nil
  1263  }
  1264  
  1265  func (r *Registry) unregister(jobID int64) {
  1266  	r.mu.Lock()
  1267  	defer r.mu.Unlock()
  1268  	cancel, ok := r.mu.jobs[jobID]
  1269  	// It is possible for a job to be double unregistered. unregister is always
  1270  	// called at the end of resume. But it can also be called during cancelAll
  1271  	// and in the adopt loop under certain circumstances.
  1272  	if ok {
  1273  		cancel()
  1274  		delete(r.mu.jobs, jobID)
  1275  	}
  1276  }
  1277  
  1278  // TestingNudgeAdoptionQueue is used by tests to tell the registry that there is
  1279  // a job to be adopted.
  1280  func (r *Registry) TestingNudgeAdoptionQueue() {
  1281  	r.adoptionCh <- struct{}{}
  1282  }