github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/jobs/jobs.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package jobs
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"reflect"
    17  	"sync/atomic"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    20  	"github.com/cockroachdb/cockroach/pkg/kv"
    21  	"github.com/cockroachdb/cockroach/pkg/security"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/sqlutil"
    26  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    27  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    28  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    29  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    30  	"github.com/cockroachdb/errors"
    31  )
    32  
// Job manages logging the progress of long-running system processes, like
// backups and restores, to the system.jobs table.
type Job struct {
	// TODO(benesch): avoid giving Job a reference to Registry. This will likely
	// require inverting control: rather than having the worker call Created,
	// Started, etc., have Registry call a setupFn and a workFn as appropriate.
	registry *Registry

	// id is the job's ID in system.jobs; it is nil for a Job that has not been
	// assigned one (insert sets it once the row has been written).
	// txn, when non-nil, is the transaction runInTxn uses for the next
	// operation; runInTxn resets it to nil afterwards (see WithTxn).
	id  *int64
	txn *kv.Txn
	mu  struct {
		syncutil.Mutex
		// payload and progress are the in-memory copies of the job's protos
		// that the Payload, Progress and Details accessors return while
		// holding the mutex.
		payload  jobspb.Payload
		progress jobspb.Progress
	}
}
    49  
// Record bundles together the user-managed fields in jobspb.Payload.
type Record struct {
	Description   string
	Statement     string
	Username      string
	DescriptorIDs sqlbase.IDs
	Details       jobspb.Details
	Progress      jobspb.ProgressDetails
	RunningStatus RunningStatus
	// NonCancelable denotes that the job cannot be canceled. This field will
	// not be respected in mixed-version clusters where some nodes have a
	// version < 20.1, so it can only be used when all nodes are guaranteed to
	// have versions >= 20.1.
	NonCancelable bool
}
    65  
// StartableJob is a job created with a transaction to be started later.
// It embeds *Job, so all Job methods are available on it.
// See Registry.CreateStartableJob.
type StartableJob struct {
	*Job
	txn        *kv.Txn
	resumer    Resumer
	resumerCtx context.Context
	cancel     context.CancelFunc
	resultsCh  chan<- tree.Datums
	starts     int64 // used to detect multiple calls to Start()
}
    77  
    78  func init() {
    79  	// NB: This exists to make the jobs payload usable during testrace. See the
    80  	// comment on protoutil.Clone and the implementation of Marshal when run under
    81  	// race.
    82  	var jobPayload jobspb.Payload
    83  	jobsDetailsInterfaceType := reflect.TypeOf(&jobPayload.Details).Elem()
    84  	var jobProgress jobspb.Progress
    85  	jobsProgressDetailsInterfaceType := reflect.TypeOf(&jobProgress.Details).Elem()
    86  	protoutil.RegisterUnclonableType(jobsDetailsInterfaceType, reflect.Array)
    87  	protoutil.RegisterUnclonableType(jobsProgressDetailsInterfaceType, reflect.Array)
    88  
    89  }
    90  
// Status represents the status of a job in the system.jobs table. The
// recognized values are the Status* constants declared in this file.
type Status string
    93  
// RunningStatus represents the more detailed status of a running job in
// the system.jobs table. Job.RunningStatus stores it, converted to a plain
// string, in the job's progress.
type RunningStatus string
    97  
const (
	// StatusPending is for jobs that have been created but on which work has
	// not yet started.
	StatusPending Status = "pending"
	// StatusRunning is for jobs that are currently in progress.
	StatusRunning Status = "running"
	// StatusPaused is for jobs that are not currently performing work, but have
	// saved their state and can be resumed by the user later.
	StatusPaused Status = "paused"
	// StatusFailed is for jobs that failed.
	StatusFailed Status = "failed"
	// StatusReverting is for jobs that failed or were canceled and whose
	// changes are being reverted.
	StatusReverting Status = "reverting"
	// StatusSucceeded is for jobs that have successfully completed.
	StatusSucceeded Status = "succeeded"
	// StatusCanceled is for jobs that were explicitly canceled by the user and
	// cannot be resumed.
	StatusCanceled Status = "canceled"
	// StatusCancelRequested is for jobs that were requested to be canceled by
	// the user but may be still running Resume. The node that is running the job
	// will change it to StatusReverting the next time it runs maybeAdoptJobs.
	StatusCancelRequested Status = "cancel-requested"
	// StatusPauseRequested is for jobs that were requested to be paused by the
	// user but may be still resuming or reverting. The node that is running the
	// job will change its state to StatusPaused the next time it runs
	// maybeAdoptJobs and will stop running it.
	StatusPauseRequested Status = "pause-requested"
)
   127  
var (
	// errJobCanceled is a package-level error value with the message
	// "job canceled by user". It is not referenced in the visible part of
	// this file.
	errJobCanceled = errors.New("job canceled by user")
)
   131  
   132  // isOldSchemaChangeJob returns whether the provided payload is for a job that
   133  // is a 19.2-style schema change, and therefore cannot be run or updated in 20.1
   134  // (without first having undergone a migration).
   135  // TODO (lucy): Remove this in 20.2. (I think it's possible in theory for a 19.2
   136  // schema change job to persist on a 20.1 cluster indefinitely, since the
   137  // migration is asynchronous, so this will take some care beyond just removing
   138  // the format version gate.)
   139  func isOldSchemaChangeJob(payload *jobspb.Payload) bool {
   140  	schemaChangeDetails, ok := payload.UnwrapDetails().(jobspb.SchemaChangeDetails)
   141  	return ok && schemaChangeDetails.FormatVersion < jobspb.JobResumerFormatVersion
   142  }
   143  
   144  // Terminal returns whether this status represents a "terminal" state: a state
   145  // after which the job should never be updated again.
   146  func (s Status) Terminal() bool {
   147  	return s == StatusFailed || s == StatusSucceeded || s == StatusCanceled
   148  }
   149  
// InvalidStatusError is the error returned when the desired operation is
// invalid given the job's current status.
type InvalidStatusError struct {
	id     int64  // ID of the job the operation targeted.
	status Status // Status the job had when the operation was rejected.
	op     string // Description of the rejected operation.
	err    string // Optional error text; included in the message when non-empty.
}
   158  
   159  func (e *InvalidStatusError) Error() string {
   160  	if e.err != "" {
   161  		return fmt.Sprintf("cannot %s %s job (id %d, err: %q)", e.op, e.status, e.id, e.err)
   162  	}
   163  	return fmt.Sprintf("cannot %s %s job (id %d)", e.op, e.status, e.id)
   164  }
   165  
   166  // SimplifyInvalidStatusError unwraps an *InvalidStatusError into an error
   167  // message suitable for users. Other errors are returned as passed.
   168  func SimplifyInvalidStatusError(err error) error {
   169  	if ierr := (*InvalidStatusError)(nil); errors.As(err, &ierr) {
   170  		return errors.Errorf("job %s", ierr.status)
   171  	}
   172  	return err
   173  }
   174  
// ID returns the ID of the job that this Job is currently tracking. The
// returned pointer is the Job's own id field and is nil when no ID has been
// assigned yet (insert assigns it once the job row has been written).
func (j *Job) ID() *int64 {
	return j.id
}
   180  
   181  // Created records the creation of a new job in the system.jobs table and
   182  // remembers the assigned ID of the job in the Job. The job information is read
   183  // from the Record field at the time Created is called.
   184  func (j *Job) created(ctx context.Context) error {
   185  	if j.ID() != nil {
   186  		return errors.Errorf("job already created with ID %v", *j.ID())
   187  	}
   188  	return j.insert(ctx, j.registry.makeJobID(), nil /* lease */)
   189  }
   190  
   191  // Started marks the tracked job as started.
   192  func (j *Job) started(ctx context.Context) error {
   193  	return j.Update(ctx, func(_ *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   194  		if md.Status != StatusPending && md.Status != StatusRunning {
   195  			return errors.Errorf("job with status %s cannot be marked started", md.Status)
   196  		}
   197  		// TODO(spaskob): Remove this status change after we stop supporting
   198  		// pending job states.
   199  		ju.UpdateStatus(StatusRunning)
   200  		md.Payload.StartedMicros = timeutil.ToUnixMicros(j.registry.clock.Now().GoTime())
   201  		ju.UpdatePayload(md.Payload)
   202  		return nil
   203  	})
   204  }
   205  
   206  // CheckStatus verifies the status of the job and returns an error if the job's
   207  // status isn't Running or Reverting.
   208  func (j *Job) CheckStatus(ctx context.Context) error {
   209  	return j.Update(ctx, func(_ *kv.Txn, md JobMetadata, _ *JobUpdater) error {
   210  		return md.CheckRunningOrReverting()
   211  	})
   212  }
   213  
   214  // CheckTerminalStatus returns true if the job is in a terminal status.
   215  func (j *Job) CheckTerminalStatus(ctx context.Context) bool {
   216  	err := j.Update(ctx, func(_ *kv.Txn, md JobMetadata, _ *JobUpdater) error {
   217  		if !md.Status.Terminal() {
   218  			return &InvalidStatusError{md.ID, md.Status, "checking that job status is success", md.Payload.Error}
   219  		}
   220  		return nil
   221  	})
   222  
   223  	return err == nil
   224  }
   225  
   226  // RunningStatus updates the detailed status of a job currently in progress.
   227  // It sets the job's RunningStatus field to the value returned by runningStatusFn
   228  // and persists runningStatusFn's modifications to the job's details, if any.
   229  func (j *Job) RunningStatus(ctx context.Context, runningStatusFn RunningStatusFn) error {
   230  	return j.Update(ctx, func(_ *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   231  		if err := md.CheckRunningOrReverting(); err != nil {
   232  			return err
   233  		}
   234  		runningStatus, err := runningStatusFn(ctx, md.Progress.Details)
   235  		if err != nil {
   236  			return err
   237  		}
   238  		md.Progress.RunningStatus = string(runningStatus)
   239  		ju.UpdateProgress(md.Progress)
   240  		return nil
   241  	})
   242  }
   243  
   244  // SetDescription updates the description of a created job.
   245  func (j *Job) SetDescription(ctx context.Context, updateFn DescriptionUpdateFn) error {
   246  	return j.Update(ctx, func(_ *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   247  		prev := md.Payload.Description
   248  		desc, err := updateFn(ctx, prev)
   249  		if err != nil {
   250  			return err
   251  		}
   252  		if prev != desc {
   253  			md.Payload.Description = desc
   254  			ju.UpdatePayload(md.Payload)
   255  		}
   256  		return nil
   257  	})
   258  }
   259  
// RunningStatusFn is a callback that computes a job's running status
// given its details. Job.RunningStatus invokes it with the details held in
// the job's progress and persists that progress afterwards, so modifications
// made to the details in the callback are written to the database record.
// A non-nil error aborts the update.
type RunningStatusFn func(ctx context.Context, details jobspb.Details) (RunningStatus, error)
   264  
// DescriptionUpdateFn is a callback that computes a job's description
// given its current one. Job.SetDescription persists the returned description
// only if it differs from the current one; a non-nil error aborts the update.
type DescriptionUpdateFn func(ctx context.Context, description string) (string, error)
   268  
// FractionProgressedFn is a callback that computes a job's completion fraction
// given its progress details. It is safe to modify details in the callback;
// Job.FractionProgressed persists the job's progress after calling it, so those
// modifications are written to the database record.
type FractionProgressedFn func(ctx context.Context, details jobspb.ProgressDetails) float32
   273  
   274  // FractionUpdater returns a FractionProgressedFn that returns its argument.
   275  func FractionUpdater(f float32) FractionProgressedFn {
   276  	return func(ctx context.Context, details jobspb.ProgressDetails) float32 {
   277  		return f
   278  	}
   279  }
   280  
// HighWaterProgressedFn is a callback that computes a job's high-water mark
// given its progress details. Job.HighWaterProgressed calls it with the
// transaction of the update and persists the job's progress afterwards, so
// modifications made to details in the callback are written to the database
// record. A non-nil error aborts the update.
type HighWaterProgressedFn func(ctx context.Context, txn *kv.Txn, details jobspb.ProgressDetails) (hlc.Timestamp, error)
   285  
   286  // FractionProgressed updates the progress of the tracked job. It sets the job's
   287  // FractionCompleted field to the value returned by progressedFn and persists
   288  // progressedFn's modifications to the job's progress details, if any.
   289  //
   290  // Jobs for which progress computations do not depend on their details can
   291  // use the FractionUpdater helper to construct a ProgressedFn.
   292  func (j *Job) FractionProgressed(ctx context.Context, progressedFn FractionProgressedFn) error {
   293  	return j.Update(ctx, func(_ *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   294  		if err := md.CheckRunningOrReverting(); err != nil {
   295  			return err
   296  		}
   297  		fractionCompleted := progressedFn(ctx, md.Progress.Details)
   298  		// allow for slight floating-point rounding inaccuracies
   299  		if fractionCompleted > 1.0 && fractionCompleted < 1.01 {
   300  			fractionCompleted = 1.0
   301  		}
   302  		if fractionCompleted < 0.0 || fractionCompleted > 1.0 {
   303  			return errors.Errorf(
   304  				"Job: fractionCompleted %f is outside allowable range [0.0, 1.0] (job %d)",
   305  				fractionCompleted, *j.ID(),
   306  			)
   307  		}
   308  		md.Progress.Progress = &jobspb.Progress_FractionCompleted{
   309  			FractionCompleted: fractionCompleted,
   310  		}
   311  		ju.UpdateProgress(md.Progress)
   312  		return nil
   313  	})
   314  }
   315  
   316  // HighWaterProgressed updates the progress of the tracked job. It sets the
   317  // job's HighWater field to the value returned by progressedFn and persists
   318  // progressedFn's modifications to the job's progress details, if any.
   319  func (j *Job) HighWaterProgressed(ctx context.Context, progressedFn HighWaterProgressedFn) error {
   320  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   321  		if err := md.CheckRunningOrReverting(); err != nil {
   322  			return err
   323  		}
   324  		highWater, err := progressedFn(ctx, txn, md.Progress.Details)
   325  		if err != nil {
   326  			return err
   327  		}
   328  		if highWater.Less(hlc.Timestamp{}) {
   329  			return errors.Errorf(
   330  				"Job: high-water %s is outside allowable range > 0.0 (job %d)",
   331  				highWater, *j.ID(),
   332  			)
   333  		}
   334  		md.Progress.Progress = &jobspb.Progress_HighWater{
   335  			HighWater: &highWater,
   336  		}
   337  		ju.UpdateProgress(md.Progress)
   338  		return nil
   339  	})
   340  }
   341  
   342  // paused sets the status of the tracked job to paused. It is called by the
   343  // registry adoption loop by the node currently running a job to move it from
   344  // pauseRequested to paused.
   345  func (j *Job) paused(ctx context.Context, fn func(context.Context, *kv.Txn) error) error {
   346  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   347  		if md.Status == StatusPaused {
   348  			// Already paused - do nothing.
   349  			return nil
   350  		}
   351  		if md.Status != StatusPauseRequested {
   352  			return fmt.Errorf("job with status %s cannot be set to paused", md.Status)
   353  		}
   354  		if fn != nil {
   355  			if err := fn(ctx, txn); err != nil {
   356  				return err
   357  			}
   358  		}
   359  		ju.UpdateStatus(StatusPaused)
   360  		return nil
   361  	})
   362  }
   363  
   364  // resumed sets the status of the tracked job to running or reverting iff the
   365  // job is currently paused. It does not directly resume the job; rather, it
   366  // expires the job's lease so that a Registry adoption loop detects it and
   367  // resumes it.
   368  func (j *Job) resumed(ctx context.Context) error {
   369  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   370  		if md.Status == StatusRunning || md.Status == StatusReverting {
   371  			// Already resumed - do nothing.
   372  			return nil
   373  		}
   374  		if md.Status != StatusPaused {
   375  			return fmt.Errorf("job with status %s cannot be resumed", md.Status)
   376  		}
   377  		// We use the absence of error to determine what state we should
   378  		// resume into.
   379  		if md.Payload.FinalResumeError == nil {
   380  			ju.UpdateStatus(StatusRunning)
   381  		} else {
   382  			ju.UpdateStatus(StatusReverting)
   383  		}
   384  		// NB: A nil lease indicates the job is not resumable, whereas an empty
   385  		// lease is always considered expired.
   386  		md.Payload.Lease = &jobspb.Lease{}
   387  		ju.UpdatePayload(md.Payload)
   388  		return nil
   389  	})
   390  }
   391  
   392  // cancelRequested sets the status of the tracked job to cancel-requested. It
   393  // does not directly cancel the job; like job.Paused, it expects the job to call
   394  // job.Progressed soon, observe a "job is cancel-requested" error, and abort.
   395  // Further the node the runs the job will actively cancel it when it notices
   396  // that it is in state StatusCancelRequested and will move it to state
   397  // StatusReverting.
   398  func (j *Job) cancelRequested(ctx context.Context, fn func(context.Context, *kv.Txn) error) error {
   399  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   400  		// Don't allow 19.2-style schema change jobs to undergo changes in job state
   401  		// before they undergo a migration to make them properly runnable in 20.1 and
   402  		// later versions. While we could support cancellation in principle, the
   403  		// point is to cut down on the number of possible states that the migration
   404  		// could encounter.
   405  		//
   406  		// TODO (lucy): Remove this in 20.2.
   407  		if isOldSchemaChangeJob(md.Payload) {
   408  			return errors.Newf(
   409  				"schema change job was created in earlier version, and cannot be " +
   410  					"canceled in this version until the upgrade is finalized and an internal migration is complete")
   411  		}
   412  
   413  		if md.Payload.Noncancelable {
   414  			return errors.Newf("job %d: not cancelable", *j.ID())
   415  		}
   416  		if md.Status == StatusCancelRequested || md.Status == StatusCanceled {
   417  			return nil
   418  		}
   419  		if md.Status != StatusPending && md.Status != StatusRunning && md.Status != StatusPaused {
   420  			return fmt.Errorf("job with status %s cannot be requested to be canceled", md.Status)
   421  		}
   422  		if md.Status == StatusPaused && md.Payload.FinalResumeError != nil {
   423  			decodedErr := errors.DecodeError(ctx, *md.Payload.FinalResumeError)
   424  			return fmt.Errorf("job %d is paused and has non-nil FinalResumeError %s hence cannot be canceled and should be reverted", *j.ID(), decodedErr.Error())
   425  		}
   426  		if fn != nil {
   427  			if err := fn(ctx, txn); err != nil {
   428  				return err
   429  			}
   430  		}
   431  		ju.UpdateStatus(StatusCancelRequested)
   432  		return nil
   433  	})
   434  }
   435  
// onPauseRequestFunc is a function used to perform action on behalf of a job
// implementation when a pause is requested. pauseRequested calls it inside the
// update's transaction with a plan hook state obtained from the registry's
// planFn, that transaction, and the job's progress; the progress is persisted
// after the function returns successfully. A non-nil error aborts the pause
// request.
type onPauseRequestFunc func(
	ctx context.Context, planHookState interface{}, txn *kv.Txn, progress *jobspb.Progress,
) error
   441  
   442  // pauseRequested sets the status of the tracked job to pause-requested. It does
   443  // not directly pause the job; it expects the node that runs the job will
   444  // actively cancel it when it notices that it is in state StatusPauseRequested
   445  // and will move it to state StatusPaused.
   446  func (j *Job) pauseRequested(ctx context.Context, fn onPauseRequestFunc) error {
   447  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   448  		// Don't allow 19.2-style schema change jobs to undergo changes in job state
   449  		// before they undergo a migration to make them properly runnable in 20.1 and
   450  		// later versions.
   451  		//
   452  		// In particular, schema change jobs could not be paused in 19.2, so allowing
   453  		// pausing here could break backward compatibility during an upgrade by
   454  		// forcing 19.2 nodes to deal with a schema change job in a state that wasn't
   455  		// possible in 19.2.
   456  		//
   457  		// TODO (lucy): Remove this in 20.2.
   458  		if isOldSchemaChangeJob(md.Payload) {
   459  			return errors.Newf(
   460  				"schema change job was created in earlier version, and cannot be " +
   461  					"paused in this version until the upgrade is finalized and an internal migration is complete")
   462  		}
   463  
   464  		if md.Status == StatusPauseRequested || md.Status == StatusPaused {
   465  			return nil
   466  		}
   467  		if md.Status != StatusPending && md.Status != StatusRunning && md.Status != StatusReverting {
   468  			return fmt.Errorf("job with status %s cannot be requested to be paused", md.Status)
   469  		}
   470  		if fn != nil {
   471  			phs, cleanup := j.registry.planFn("pause request", j.Payload().Username)
   472  			defer cleanup()
   473  			if err := fn(ctx, phs, txn, md.Progress); err != nil {
   474  				return err
   475  			}
   476  			ju.UpdateProgress(md.Progress)
   477  		}
   478  		ju.UpdateStatus(StatusPauseRequested)
   479  		return nil
   480  	})
   481  }
   482  
   483  // reverted sets the status of the tracked job to reverted.
   484  func (j *Job) reverted(
   485  	ctx context.Context, err error, fn func(context.Context, *kv.Txn) error,
   486  ) error {
   487  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   488  		if md.Status == StatusReverting {
   489  			return nil
   490  		}
   491  		if md.Status != StatusCancelRequested && md.Status != StatusRunning && md.Status != StatusPending {
   492  			return fmt.Errorf("job with status %s cannot be reverted", md.Status)
   493  		}
   494  		if fn != nil {
   495  			if err := fn(ctx, txn); err != nil {
   496  				return err
   497  			}
   498  		}
   499  		if err != nil {
   500  			md.Payload.Error = err.Error()
   501  			encodedErr := errors.EncodeError(ctx, err)
   502  			md.Payload.FinalResumeError = &encodedErr
   503  			ju.UpdatePayload(md.Payload)
   504  		} else {
   505  			if md.Payload.FinalResumeError == nil {
   506  				return errors.AssertionFailedf(
   507  					"tried to mark job as reverting, but no error was provided or recorded")
   508  			}
   509  		}
   510  		ju.UpdateStatus(StatusReverting)
   511  		return nil
   512  	})
   513  }
   514  
   515  // Canceled sets the status of the tracked job to cancel.
   516  func (j *Job) canceled(ctx context.Context, fn func(context.Context, *kv.Txn) error) error {
   517  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   518  		if md.Status == StatusCanceled {
   519  			return nil
   520  		}
   521  		if md.Status != StatusReverting {
   522  			return fmt.Errorf("job with status %s cannot be requested to be canceled", md.Status)
   523  		}
   524  		if fn != nil {
   525  			if err := fn(ctx, txn); err != nil {
   526  				return err
   527  			}
   528  		}
   529  		ju.UpdateStatus(StatusCanceled)
   530  		md.Payload.FinishedMicros = timeutil.ToUnixMicros(j.registry.clock.Now().GoTime())
   531  		ju.UpdatePayload(md.Payload)
   532  		return nil
   533  	})
   534  }
   535  
   536  // Failed marks the tracked job as having failed with the given error.
   537  func (j *Job) failed(
   538  	ctx context.Context, err error, fn func(context.Context, *kv.Txn) error,
   539  ) error {
   540  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   541  		// TODO(spaskob): should we fail if the terminal state is not StatusFailed?
   542  		if md.Status.Terminal() {
   543  			// Already done - do nothing.
   544  			return nil
   545  		}
   546  		if fn != nil {
   547  			if err := fn(ctx, txn); err != nil {
   548  				return err
   549  			}
   550  		}
   551  		ju.UpdateStatus(StatusFailed)
   552  		md.Payload.Error = err.Error()
   553  		md.Payload.FinishedMicros = timeutil.ToUnixMicros(j.registry.clock.Now().GoTime())
   554  		ju.UpdatePayload(md.Payload)
   555  		return nil
   556  	})
   557  }
   558  
   559  // succeeded marks the tracked job as having succeeded and sets its fraction
   560  // completed to 1.0.
   561  func (j *Job) succeeded(ctx context.Context, fn func(context.Context, *kv.Txn) error) error {
   562  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   563  		if md.Status == StatusSucceeded {
   564  			return nil
   565  		}
   566  		if md.Status != StatusRunning && md.Status != StatusPending {
   567  			return errors.Errorf("Job with status %s cannot be marked as succeeded", md.Status)
   568  		}
   569  		if fn != nil {
   570  			if err := fn(ctx, txn); err != nil {
   571  				return err
   572  			}
   573  		}
   574  		ju.UpdateStatus(StatusSucceeded)
   575  		md.Payload.FinishedMicros = timeutil.ToUnixMicros(j.registry.clock.Now().GoTime())
   576  		ju.UpdatePayload(md.Payload)
   577  		md.Progress.Progress = &jobspb.Progress_FractionCompleted{
   578  			FractionCompleted: 1.0,
   579  		}
   580  		ju.UpdateProgress(md.Progress)
   581  		return nil
   582  	})
   583  }
   584  
   585  // SetDetails sets the details field of the currently running tracked job.
   586  func (j *Job) SetDetails(ctx context.Context, details interface{}) error {
   587  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   588  		md.Payload.Details = jobspb.WrapPayloadDetails(details)
   589  		ju.UpdatePayload(md.Payload)
   590  		return nil
   591  	})
   592  }
   593  
   594  // SetProgress sets the details field of the currently running tracked job.
   595  func (j *Job) SetProgress(ctx context.Context, details interface{}) error {
   596  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   597  		md.Progress.Details = jobspb.WrapProgressDetails(details)
   598  		ju.UpdateProgress(md.Progress)
   599  		return nil
   600  	})
   601  }
   602  
   603  // Payload returns the most recently sent Payload for this Job.
   604  func (j *Job) Payload() jobspb.Payload {
   605  	j.mu.Lock()
   606  	defer j.mu.Unlock()
   607  	return j.mu.payload
   608  }
   609  
   610  // Progress returns the most recently sent Progress for this Job.
   611  func (j *Job) Progress() jobspb.Progress {
   612  	j.mu.Lock()
   613  	defer j.mu.Unlock()
   614  	return j.mu.progress
   615  }
   616  
   617  // Details returns the details from the most recently sent Payload for this Job.
   618  func (j *Job) Details() jobspb.Details {
   619  	j.mu.Lock()
   620  	defer j.mu.Unlock()
   621  	return j.mu.payload.UnwrapDetails()
   622  }
   623  
// FractionCompleted returns completion according to the in-memory job state.
// It reads a copy of the cached Progress (via Progress, which takes the
// mutex) rather than consulting the system.jobs table.
func (j *Job) FractionCompleted() float32 {
	// NOTE(review): the copy is kept in a local presumably because
	// GetFractionCompleted has a pointer receiver and therefore needs an
	// addressable value - confirm before inlining the call.
	progress := j.Progress()
	return progress.GetFractionCompleted()
}
   629  
// WithTxn sets the transaction that this Job will use for its next operation.
// If the transaction is nil, the Job will create a one-off transaction instead.
// If you use WithTxn, this Job will no longer be threadsafe.
//
// The transaction applies to a single operation only: runInTxn resets it to
// nil once it has been used. The receiver is returned so calls can be chained.
func (j *Job) WithTxn(txn *kv.Txn) *Job {
	j.txn = txn
	return j
}
   637  
// MakeSessionBoundInternalExecutor makes an internal executor, for use in a job
// resumer, and sets it with the provided session data. See the comment on
// sessionBoundInternalExecutorFactory for a more detailed explanation of why
// this exists.
//
// It simply delegates to the registry's sessionBoundInternalExecutorFactory.
func (j *Job) MakeSessionBoundInternalExecutor(
	ctx context.Context, sd *sessiondata.SessionData,
) sqlutil.InternalExecutor {
	return j.registry.sessionBoundInternalExecutorFactory(ctx, sd)
}
   647  
   648  func (j *Job) runInTxn(ctx context.Context, fn func(context.Context, *kv.Txn) error) error {
   649  	if j.txn != nil {
   650  		defer func() { j.txn = nil }()
   651  		// Don't run fn in a retry loop because we need retryable errors to
   652  		// propagate up to the transaction's properly-scoped retry loop.
   653  		return fn(ctx, j.txn)
   654  	}
   655  	return j.registry.db.Txn(ctx, fn)
   656  }
   657  
   658  // JobNotFoundError is returned from load when the job does not exist.
   659  type JobNotFoundError struct {
   660  	jobID int64
   661  }
   662  
   663  // Error makes JobNotFoundError an error.
   664  func (e *JobNotFoundError) Error() string {
   665  	return fmt.Sprintf("job with ID %d does not exist", e.jobID)
   666  }
   667  
// HasJobNotFoundError returns true if the error contains a JobNotFoundError.
// The check is delegated to errors.HasType with a *JobNotFoundError reference
// value.
func HasJobNotFoundError(err error) bool {
	return errors.HasType(err, (*JobNotFoundError)(nil))
}
   672  
   673  func (j *Job) load(ctx context.Context) error {
   674  	var payload *jobspb.Payload
   675  	var progress *jobspb.Progress
   676  	if err := j.runInTxn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   677  		const stmt = "SELECT payload, progress FROM system.jobs WHERE id = $1"
   678  		row, err := j.registry.ex.QueryRowEx(
   679  			ctx, "load-job-query", txn, sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser},
   680  			stmt, *j.ID())
   681  		if err != nil {
   682  			return err
   683  		}
   684  		if row == nil {
   685  			return &JobNotFoundError{jobID: *j.ID()}
   686  		}
   687  		payload, err = UnmarshalPayload(row[0])
   688  		if err != nil {
   689  			return err
   690  		}
   691  		progress, err = UnmarshalProgress(row[1])
   692  		return err
   693  	}); err != nil {
   694  		return err
   695  	}
   696  	j.mu.payload = *payload
   697  	j.mu.progress = *progress
   698  	return nil
   699  }
   700  
   701  func (j *Job) insert(ctx context.Context, id int64, lease *jobspb.Lease) error {
   702  	if j.id != nil {
   703  		// Already created - do nothing.
   704  		return nil
   705  	}
   706  
   707  	j.mu.payload.Lease = lease
   708  
   709  	if err := j.runInTxn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   710  		// Note: although the following uses ReadTimestamp and
   711  		// ReadTimestamp can diverge from the value of now() throughout a
   712  		// transaction, this may be OK -- we merely required ModifiedMicro
   713  		// to be equal *or greater* than previously inserted timestamps
   714  		// computed by now(). For now ReadTimestamp can only move forward
   715  		// and the assertion ReadTimestamp >= now() holds at all times.
   716  		j.mu.progress.ModifiedMicros = timeutil.ToUnixMicros(txn.ReadTimestamp().GoTime())
   717  		payloadBytes, err := protoutil.Marshal(&j.mu.payload)
   718  		if err != nil {
   719  			return err
   720  		}
   721  		progressBytes, err := protoutil.Marshal(&j.mu.progress)
   722  		if err != nil {
   723  			return err
   724  		}
   725  
   726  		const stmt = "INSERT INTO system.jobs (id, status, payload, progress) VALUES ($1, $2, $3, $4)"
   727  		_, err = j.registry.ex.Exec(ctx, "job-insert", txn, stmt, id, StatusRunning, payloadBytes, progressBytes)
   728  		return err
   729  	}); err != nil {
   730  		return err
   731  	}
   732  	j.id = &id
   733  	return nil
   734  }
   735  
   736  func (j *Job) adopt(ctx context.Context, oldLease *jobspb.Lease) error {
   737  	return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error {
   738  		if !md.Payload.Lease.Equal(oldLease) {
   739  			return errors.Errorf("current lease %v did not match expected lease %v",
   740  				md.Payload.Lease, oldLease)
   741  		}
   742  		md.Payload.Lease = j.registry.newLease()
   743  		if md.Payload.StartedMicros == 0 {
   744  			md.Payload.StartedMicros = timeutil.ToUnixMicros(j.registry.clock.Now().GoTime())
   745  		}
   746  		ju.UpdatePayload(md.Payload)
   747  		return nil
   748  	})
   749  }
   750  
   751  // UnmarshalPayload unmarshals and returns the Payload encoded in the input
   752  // datum, which should be a tree.DBytes.
   753  func UnmarshalPayload(datum tree.Datum) (*jobspb.Payload, error) {
   754  	payload := &jobspb.Payload{}
   755  	bytes, ok := datum.(*tree.DBytes)
   756  	if !ok {
   757  		return nil, errors.Errorf(
   758  			"Job: failed to unmarshal payload as DBytes (was %T)", datum)
   759  	}
   760  	if err := protoutil.Unmarshal([]byte(*bytes), payload); err != nil {
   761  		return nil, err
   762  	}
   763  	return payload, nil
   764  }
   765  
   766  // UnmarshalProgress unmarshals and returns the Progress encoded in the input
   767  // datum, which should be a tree.DBytes.
   768  func UnmarshalProgress(datum tree.Datum) (*jobspb.Progress, error) {
   769  	progress := &jobspb.Progress{}
   770  	bytes, ok := datum.(*tree.DBytes)
   771  	if !ok {
   772  		return nil, errors.Errorf(
   773  			"Job: failed to unmarshal Progress as DBytes (was %T)", datum)
   774  	}
   775  	if err := protoutil.Unmarshal([]byte(*bytes), progress); err != nil {
   776  		return nil, err
   777  	}
   778  	return progress, nil
   779  }
   780  
   781  // CurrentStatus returns the current job status from the jobs table or error.
   782  func (j *Job) CurrentStatus(ctx context.Context) (Status, error) {
   783  	if j.id == nil {
   784  		return "", errors.New("job has not been created")
   785  	}
   786  	var statusString tree.DString
   787  	if err := j.runInTxn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   788  		const selectStmt = "SELECT status FROM system.jobs WHERE id = $1"
   789  		row, err := j.registry.ex.QueryRow(ctx, "job-status", txn, selectStmt, *j.ID())
   790  		if err != nil {
   791  			return errors.Wrapf(err, "job %d: can't query system.jobs", *j.ID())
   792  		}
   793  		if row == nil {
   794  			return errors.Errorf("job %d: not found in system.jobs", *j.ID())
   795  		}
   796  
   797  		statusString = tree.MustBeDString(row[0])
   798  		return nil
   799  	}); err != nil {
   800  		return "", err
   801  	}
   802  	return Status(statusString), nil
   803  }
   804  
   805  // Start will resume the job. The transaction used to create the StartableJob
   806  // must be committed. If a non-nil error is returned, the job was not started
   807  // and nothing will be send on errCh. Clients must not start jobs more than
   808  // once.
   809  func (sj *StartableJob) Start(ctx context.Context) (errCh <-chan error, err error) {
   810  	if starts := atomic.AddInt64(&sj.starts, 1); starts != 1 {
   811  		return nil, errors.AssertionFailedf(
   812  			"StartableJob %d cannot be started more than once", *sj.ID())
   813  	}
   814  	defer func() {
   815  		if err != nil {
   816  			sj.registry.unregister(*sj.ID())
   817  		}
   818  	}()
   819  	if !sj.txn.IsCommitted() {
   820  		return nil, fmt.Errorf("cannot resume %T job which is not committed", sj.resumer)
   821  	}
   822  	if err := sj.started(ctx); err != nil {
   823  		return nil, err
   824  	}
   825  	errCh, err = sj.registry.resume(sj.resumerCtx, sj.resumer, sj.resultsCh, sj.Job)
   826  	if err != nil {
   827  		return nil, err
   828  	}
   829  	return errCh, nil
   830  }
   831  
   832  // Run will resume the job and wait for it to finish or the context to be
   833  // canceled. The transaction used to create the StartableJob must be committed.
   834  // Results will be copied to the channel used to create this StartableJob
   835  // even if job is canceled.
   836  func (sj *StartableJob) Run(ctx context.Context) error {
   837  	resultsFromJob := make(chan tree.Datums)
   838  	resultsCh := sj.resultsCh
   839  	sj.resultsCh = resultsFromJob
   840  	errCh, err := sj.Start(ctx)
   841  	if err != nil {
   842  		return err
   843  	}
   844  	jobCompletedOk := false
   845  
   846  	var r tree.Datums // stores a row if we've received one.
   847  	for {
   848  		// Alternate between receiving rows and sending them. Nil channels block.
   849  		var fromJob <-chan tree.Datums
   850  		var toClient chan<- tree.Datums
   851  		if r == nil {
   852  			fromJob = resultsFromJob
   853  		} else {
   854  			toClient = resultsCh
   855  		}
   856  		var ok bool
   857  		select {
   858  		case r, ok = <-fromJob:
   859  			// If the results channel is closed, set it to nil so that we don't
   860  			// loop infinitely. We still want to wait for the job to notify us on
   861  			// errCh.
   862  			if !ok {
   863  				close(resultsCh)
   864  				resultsCh, resultsFromJob = nil, nil
   865  			}
   866  		case toClient <- r:
   867  			r = nil
   868  			if jobCompletedOk {
   869  				return nil
   870  			}
   871  		case <-ctx.Done():
   872  			// Launch a goroutine to continue consuming results from the job.
   873  			if resultsFromJob != nil {
   874  				go sj.registry.stopper.RunWorker(ctx, func(ctx context.Context) {
   875  					for {
   876  						select {
   877  						case <-errCh:
   878  							return
   879  						case _, ok := <-resultsFromJob:
   880  							if !ok {
   881  								return
   882  							}
   883  						}
   884  					}
   885  				})
   886  			}
   887  			return ctx.Err()
   888  		case err := <-errCh:
   889  			// The job has completed, return its final error.
   890  			if err == nil && r != nil {
   891  				// We still have data to send to the client.
   892  				jobCompletedOk = true
   893  				continue
   894  			}
   895  			return err
   896  		}
   897  	}
   898  }
   899  
   900  // CleanupOnRollback will unregister the job in the case that the creating
   901  // transaction has been rolled back.
   902  func (sj *StartableJob) CleanupOnRollback(ctx context.Context) error {
   903  	if sj.txn.IsCommitted() {
   904  		return errors.AssertionFailedf(
   905  			"cannot call CleanupOnRollback for a StartableJob created by a committed transaction")
   906  	}
   907  	if !sj.txn.Sender().TxnStatus().IsFinalized() {
   908  		return errors.AssertionFailedf(
   909  			"cannot call CleanupOnRollback for a StartableJob with a non-finalized transaction")
   910  	}
   911  	sj.registry.unregister(*sj.ID())
   912  	return nil
   913  }
   914  
   915  // Cancel will mark the job as canceled and release its resources in the
   916  // Registry.
   917  func (sj *StartableJob) Cancel(ctx context.Context) error {
   918  	defer sj.registry.unregister(*sj.ID())
   919  	return sj.registry.CancelRequested(ctx, nil, *sj.ID())
   920  }