github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/jobs/jobs_test.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package jobs_test
    12  
    13  import (
    14  	"context"
    15  	gosql "database/sql"
    16  	"fmt"
    17  	"reflect"
    18  	"strings"
    19  	"sync/atomic"
    20  	"testing"
    21  	"time"
    22  
    23  	"github.com/cockroachdb/apd"
    24  	"github.com/cockroachdb/cockroach/pkg/base"
    25  	"github.com/cockroachdb/cockroach/pkg/jobs"
    26  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    27  	"github.com/cockroachdb/cockroach/pkg/kv"
    28  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    29  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    30  	"github.com/cockroachdb/cockroach/pkg/sql"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    32  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    33  	"github.com/cockroachdb/cockroach/pkg/sql/tests"
    34  	"github.com/cockroachdb/cockroach/pkg/testutils"
    35  	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
    36  	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
    37  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    38  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    39  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    40  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    41  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    42  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    43  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    44  	"github.com/cockroachdb/errors"
    45  	"github.com/google/go-cmp/cmp"
    46  	"github.com/kr/pretty"
    47  	"github.com/stretchr/testify/assert"
    48  	"github.com/stretchr/testify/require"
    49  )
    50  
    51  // expectation defines the information necessary to determine the validity of
    52  // a job in the system.jobs table.
    53  type expectation struct {
    54  	DB                *gosql.DB
    55  	Record            jobs.Record
    56  	Type              jobspb.Type
    57  	Before            time.Time
    58  	FractionCompleted float32
    59  	Error             string
    60  }
    61  
    62  func (expected *expectation) verify(id *int64, expectedStatus jobs.Status) error {
    63  	var statusString string
    64  	var created time.Time
    65  	var payloadBytes []byte
    66  	var progressBytes []byte
    67  	if err := expected.DB.QueryRow(
    68  		`SELECT status, created, payload, progress FROM system.jobs WHERE id = $1`, id,
    69  	).Scan(
    70  		&statusString, &created, &payloadBytes, &progressBytes,
    71  	); err != nil {
    72  		return err
    73  	}
    74  
    75  	var payload jobspb.Payload
    76  	if err := protoutil.Unmarshal(payloadBytes, &payload); err != nil {
    77  		return err
    78  	}
    79  	var progress jobspb.Progress
    80  	if err := protoutil.Unmarshal(progressBytes, &progress); err != nil {
    81  		return err
    82  	}
    83  
    84  	// Verify the upstream-provided fields.
    85  	details := payload.UnwrapDetails()
    86  	progressDetail := progress.UnwrapDetails()
    87  
    88  	if e, a := expected.Record, (jobs.Record{
    89  		Description:   payload.Description,
    90  		Details:       details,
    91  		DescriptorIDs: payload.DescriptorIDs,
    92  		Username:      payload.Username,
    93  		Progress:      progressDetail,
    94  	}); !reflect.DeepEqual(e, a) {
    95  		diff := strings.Join(pretty.Diff(e, a), "\n")
    96  		return errors.Errorf("Records do not match:\n%s", diff)
    97  	}
    98  
    99  	// Verify internally-managed fields.
   100  	status := jobs.Status(statusString)
   101  	if e, a := expectedStatus, status; e != a {
   102  		return errors.Errorf("expected status %v, got %v", e, a)
   103  	}
   104  	if e, a := expected.Type, payload.Type(); e != a {
   105  		return errors.Errorf("expected type %v, got type %v", e, a)
   106  	}
   107  	if e, a := expected.FractionCompleted, progress.GetFractionCompleted(); e != a {
   108  		return errors.Errorf("expected fraction completed %f, got %f", e, a)
   109  	}
   110  
   111  	started := timeutil.FromUnixMicros(payload.StartedMicros)
   112  	if started.Equal(timeutil.UnixEpoch) && status == jobs.StatusSucceeded {
   113  		return errors.Errorf("started time is empty but job claims to be successful")
   114  	}
   115  	if status == jobs.StatusRunning || status == jobs.StatusPauseRequested {
   116  		return nil
   117  	}
   118  
   119  	if e, a := expected.Error, payload.Error; e != a {
   120  		return errors.Errorf("expected error %q, got %q", e, a)
   121  	}
   122  	return nil
   123  }
   124  
   125  func TestJobsTableProgressFamily(t *testing.T) {
   126  	defer leaktest.AfterTest(t)()
   127  
   128  	ctx := context.Background()
   129  	s, db, _ := serverutils.StartServer(t, base.TestServerArgs{})
   130  	defer s.Stopper().Stop(ctx)
   131  
   132  	var table, schema string
   133  	sqlutils.MakeSQLRunner(db).QueryRow(t, `SHOW CREATE system.jobs`).Scan(&table, &schema)
   134  	if !strings.Contains(schema, `FAMILY progress (progress)`) {
   135  		t.Fatalf("expected progress family, got %q", schema)
   136  	}
   137  }
   138  
   139  type counters struct {
   140  	ResumeExit, OnFailOrCancelExit int
   141  	// These sometimes retry so just use bool.
   142  	ResumeStart, OnFailOrCancelStart, Success bool
   143  }
   144  
   145  type registryTestSuite struct {
   146  	ctx         context.Context
   147  	oldInterval time.Duration
   148  	s           serverutils.TestServerInterface
   149  	outerDB     *gosql.DB
   150  	sqlDB       *sqlutils.SQLRunner
   151  	registry    *jobs.Registry
   152  	done        chan struct{}
   153  	mockJob     jobs.Record
   154  	job         *jobs.Job
   155  	mu          struct {
   156  		syncutil.Mutex
   157  		a counters
   158  		e counters
   159  	}
   160  	resumeCh            chan error
   161  	progressCh          chan struct{}
   162  	failOrCancelCh      chan error
   163  	resumeCheckCh       chan struct{}
   164  	failOrCancelCheckCh chan struct{}
   165  	onPauseRequest      jobs.OnPauseRequestFunc
   166  	// Instead of a ch for success, use a variable because it can retry since it
   167  	// is in a transaction.
   168  	successErr error
   169  }
   170  
   171  func noopPauseRequestFunc(
   172  	ctx context.Context, planHookState interface{}, txn *kv.Txn, progress *jobspb.Progress,
   173  ) error {
   174  	return nil
   175  }
   176  
   177  func (rts *registryTestSuite) setUp(t *testing.T) {
   178  	rts.oldInterval = jobs.DefaultAdoptInterval
   179  	jobs.DefaultAdoptInterval = time.Millisecond
   180  	rts.ctx = context.Background()
   181  	rts.s, rts.outerDB, _ = serverutils.StartServer(t, base.TestServerArgs{})
   182  	rts.sqlDB = sqlutils.MakeSQLRunner(rts.outerDB)
   183  	rts.registry = rts.s.JobRegistry().(*jobs.Registry)
   184  	rts.done = make(chan struct{})
   185  	rts.mockJob = jobs.Record{Details: jobspb.ImportDetails{}, Progress: jobspb.ImportProgress{}}
   186  
   187  	rts.resumeCh = make(chan error)
   188  	rts.progressCh = make(chan struct{})
   189  	rts.failOrCancelCh = make(chan error)
   190  	rts.resumeCheckCh = make(chan struct{})
   191  	rts.failOrCancelCheckCh = make(chan struct{})
   192  	rts.onPauseRequest = noopPauseRequestFunc
   193  
   194  	jobs.RegisterConstructor(jobspb.TypeImport, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
   195  		return jobs.FakeResumer{
   196  			OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
   197  				t.Log("Starting resume")
   198  				rts.mu.Lock()
   199  				rts.mu.a.ResumeStart = true
   200  				rts.mu.Unlock()
   201  				defer func() {
   202  					rts.mu.Lock()
   203  					rts.mu.a.ResumeExit++
   204  					rts.mu.Unlock()
   205  					t.Log("Exiting resume")
   206  				}()
   207  				for {
   208  					<-rts.resumeCheckCh
   209  					select {
   210  					case <-ctx.Done():
   211  						rts.mu.Lock()
   212  						rts.mu.a.ResumeExit--
   213  						rts.mu.Unlock()
   214  						return ctx.Err()
   215  					case err := <-rts.resumeCh:
   216  						return err
   217  					case <-rts.progressCh:
   218  						err := job.FractionProgressed(rts.ctx, jobs.FractionUpdater(0))
   219  						if err != nil {
   220  							return err
   221  						}
   222  					}
   223  				}
   224  			},
   225  			FailOrCancel: func(ctx context.Context) error {
   226  				t.Log("Starting FailOrCancel")
   227  				rts.mu.Lock()
   228  				rts.mu.a.OnFailOrCancelStart = true
   229  				rts.mu.Unlock()
   230  				defer func() {
   231  					rts.mu.Lock()
   232  					rts.mu.a.OnFailOrCancelExit++
   233  					rts.mu.Unlock()
   234  					t.Log("Exiting OnFailOrCancel")
   235  				}()
   236  				<-rts.failOrCancelCheckCh
   237  				select {
   238  				case <-ctx.Done():
   239  					rts.mu.Lock()
   240  					rts.mu.a.OnFailOrCancelExit--
   241  					rts.mu.Unlock()
   242  					return ctx.Err()
   243  				case err := <-rts.failOrCancelCh:
   244  					return err
   245  				}
   246  			},
   247  
   248  			Success: func() error {
   249  				t.Log("Starting success")
   250  				rts.mu.Lock()
   251  				defer func() {
   252  					rts.mu.Unlock()
   253  					t.Log("Exiting success")
   254  				}()
   255  				rts.mu.a.Success = true
   256  				return rts.successErr
   257  			},
   258  			PauseRequest: func(ctx context.Context, execCfg interface{}, txn *kv.Txn, progress *jobspb.Progress) error {
   259  				return rts.onPauseRequest(ctx, execCfg, txn, progress)
   260  			},
   261  		}
   262  	})
   263  }
   264  
   265  func (rts *registryTestSuite) tearDown() {
   266  	close(rts.resumeCh)
   267  	close(rts.progressCh)
   268  	close(rts.resumeCheckCh)
   269  	close(rts.failOrCancelCh)
   270  	close(rts.failOrCancelCheckCh)
   271  	close(rts.done)
   272  	rts.s.Stopper().Stop(rts.ctx)
   273  	jobs.DefaultAdoptInterval = rts.oldInterval
   274  	jobs.ResetConstructors()()
   275  }
   276  
   277  func (rts *registryTestSuite) check(t *testing.T, expectedStatus jobs.Status) {
   278  	t.Helper()
   279  	opts := retry.Options{
   280  		InitialBackoff: 5 * time.Millisecond,
   281  		MaxBackoff:     time.Second,
   282  		Multiplier:     2,
   283  	}
   284  	if err := retry.WithMaxAttempts(rts.ctx, opts, 10, func() error {
   285  		rts.mu.Lock()
   286  		defer rts.mu.Unlock()
   287  		if diff := cmp.Diff(rts.mu.e, rts.mu.a); diff != "" {
   288  			return errors.Errorf("unexpected diff: %s", diff)
   289  		}
   290  		if expectedStatus == "" {
   291  			return nil
   292  		}
   293  		st, err := rts.job.CurrentStatus(rts.ctx)
   294  		if err != nil {
   295  			return err
   296  		}
   297  		if expectedStatus != st {
   298  			return errors.Errorf("expected job status: %s but got: %s", expectedStatus, st)
   299  		}
   300  		return nil
   301  	}); err != nil {
   302  		t.Fatal(err)
   303  	}
   304  }
   305  
   306  func TestRegistryLifecycle(t *testing.T) {
   307  	defer leaktest.AfterTest(t)()
   308  	t.Run("normal success", func(t *testing.T) {
   309  		rts := registryTestSuite{}
   310  		rts.setUp(t)
   311  		defer rts.tearDown()
   312  
   313  		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   314  		if err != nil {
   315  			t.Fatal(err)
   316  		}
   317  		rts.job = j
   318  
   319  		rts.mu.e.ResumeStart = true
   320  		rts.resumeCheckCh <- struct{}{}
   321  		rts.check(t, jobs.StatusRunning)
   322  
   323  		rts.resumeCh <- nil
   324  		rts.mu.e.ResumeExit++
   325  		rts.mu.e.Success = true
   326  		rts.check(t, jobs.StatusSucceeded)
   327  	})
   328  
   329  	t.Run("create separately success", func(t *testing.T) {
   330  		rts := registryTestSuite{}
   331  		rts.setUp(t)
   332  		defer rts.tearDown()
   333  
   334  		j, err := rts.registry.CreateJobWithTxn(rts.ctx, rts.mockJob, nil)
   335  		if err != nil {
   336  			t.Fatal(err)
   337  		}
   338  		rts.job = j
   339  
   340  		rts.mu.e.ResumeStart = true
   341  		rts.check(t, jobs.StatusRunning)
   342  
   343  		rts.resumeCheckCh <- struct{}{}
   344  		rts.resumeCh <- nil
   345  		rts.mu.e.ResumeExit++
   346  		rts.mu.e.Success = true
   347  		rts.check(t, jobs.StatusSucceeded)
   348  	})
   349  
   350  	t.Run("pause running", func(t *testing.T) {
   351  		rts := registryTestSuite{}
   352  		rts.setUp(t)
   353  		defer rts.tearDown()
   354  
   355  		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   356  		if err != nil {
   357  			t.Fatal(err)
   358  		}
   359  		rts.job = j
   360  
   361  		rts.mu.e.ResumeStart = true
   362  		rts.resumeCheckCh <- struct{}{}
   363  		rts.check(t, jobs.StatusRunning)
   364  
   365  		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
   366  		rts.check(t, jobs.StatusPaused)
   367  
   368  		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
   369  		rts.check(t, jobs.StatusPaused)
   370  
   371  		rts.sqlDB.Exec(t, "RESUME JOB $1", *j.ID())
   372  		rts.check(t, jobs.StatusRunning)
   373  
   374  		rts.mu.e.ResumeStart = true
   375  		rts.resumeCheckCh <- struct{}{}
   376  		rts.check(t, jobs.StatusRunning)
   377  		rts.resumeCh <- nil
   378  		rts.mu.e.ResumeExit++
   379  
   380  		rts.mu.e.Success = true
   381  		rts.check(t, jobs.StatusSucceeded)
   382  	})
   383  
   384  	t.Run("pause reverting", func(t *testing.T) {
   385  		rts := registryTestSuite{}
   386  		rts.setUp(t)
   387  		defer rts.tearDown()
   388  
   389  		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   390  		if err != nil {
   391  			t.Fatal(err)
   392  		}
   393  		rts.job = j
   394  
   395  		rts.mu.e.ResumeStart = true
   396  		rts.resumeCheckCh <- struct{}{}
   397  		rts.check(t, jobs.StatusRunning)
   398  
   399  		// Make Resume fail.
   400  		rts.resumeCh <- errors.New("resume failed")
   401  		rts.mu.e.ResumeExit++
   402  		rts.mu.e.OnFailOrCancelStart = true
   403  		rts.failOrCancelCheckCh <- struct{}{}
   404  		rts.check(t, jobs.StatusReverting)
   405  
   406  		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
   407  		rts.check(t, jobs.StatusPaused)
   408  
   409  		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
   410  		rts.check(t, jobs.StatusPaused)
   411  
   412  		rts.sqlDB.Exec(t, "RESUME JOB $1", *j.ID())
   413  		rts.failOrCancelCheckCh <- struct{}{}
   414  		rts.check(t, jobs.StatusReverting)
   415  
   416  		rts.failOrCancelCh <- nil
   417  		rts.mu.e.OnFailOrCancelExit++
   418  		rts.check(t, jobs.StatusFailed)
   419  	})
   420  
   421  	t.Run("cancel running", func(t *testing.T) {
   422  		rts := registryTestSuite{}
   423  		rts.setUp(t)
   424  		defer rts.tearDown()
   425  		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   426  		if err != nil {
   427  			t.Fatal(err)
   428  		}
   429  		rts.job = j
   430  
   431  		rts.mu.e.ResumeStart = true
   432  		rts.resumeCheckCh <- struct{}{}
   433  		rts.check(t, jobs.StatusRunning)
   434  
   435  		rts.sqlDB.Exec(t, "CANCEL JOB $1", *j.ID())
   436  		rts.mu.e.OnFailOrCancelStart = true
   437  		rts.check(t, jobs.StatusReverting)
   438  
   439  		rts.failOrCancelCheckCh <- struct{}{}
   440  		rts.failOrCancelCh <- nil
   441  		rts.mu.e.OnFailOrCancelExit++
   442  
   443  		rts.check(t, jobs.StatusCanceled)
   444  	})
   445  
   446  	t.Run("cancel reverting", func(t *testing.T) {
   447  		rts := registryTestSuite{}
   448  		rts.setUp(t)
   449  		defer rts.tearDown()
   450  		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   451  		if err != nil {
   452  			t.Fatal(err)
   453  		}
   454  		rts.job = j
   455  
   456  		rts.mu.e.ResumeStart = true
   457  		rts.resumeCheckCh <- struct{}{}
   458  		rts.check(t, jobs.StatusRunning)
   459  
   460  		rts.sqlDB.Exec(t, "CANCEL JOB $1", *j.ID())
   461  		rts.mu.e.OnFailOrCancelStart = true
   462  		rts.check(t, jobs.StatusReverting)
   463  
   464  		rts.sqlDB.ExpectErr(t, "status reverting cannot be requested to be canceled", "CANCEL JOB $1", *j.ID())
   465  		rts.check(t, jobs.StatusReverting)
   466  	})
   467  
   468  	t.Run("cancel pause running", func(t *testing.T) {
   469  		rts := registryTestSuite{}
   470  		rts.setUp(t)
   471  		defer rts.tearDown()
   472  
   473  		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   474  		if err != nil {
   475  			t.Fatal(err)
   476  		}
   477  		rts.job = j
   478  
   479  		rts.mu.e.ResumeStart = true
   480  		rts.resumeCheckCh <- struct{}{}
   481  		rts.check(t, jobs.StatusRunning)
   482  
   483  		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
   484  		rts.check(t, jobs.StatusPaused)
   485  
   486  		rts.sqlDB.Exec(t, "CANCEL JOB $1", *j.ID())
   487  		rts.mu.e.OnFailOrCancelStart = true
   488  		rts.failOrCancelCheckCh <- struct{}{}
   489  		rts.check(t, jobs.StatusReverting)
   490  
   491  		rts.failOrCancelCh <- nil
   492  		rts.mu.e.OnFailOrCancelExit++
   493  		rts.check(t, jobs.StatusCanceled)
   494  	})
   495  
   496  	t.Run("cancel pause reverting", func(t *testing.T) {
   497  		rts := registryTestSuite{}
   498  		rts.setUp(t)
   499  		defer rts.tearDown()
   500  
   501  		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   502  		if err != nil {
   503  			t.Fatal(err)
   504  		}
   505  		rts.job = j
   506  
   507  		rts.mu.e.ResumeStart = true
   508  		rts.resumeCheckCh <- struct{}{}
   509  		rts.check(t, jobs.StatusRunning)
   510  
   511  		// Make Resume fail.
   512  		rts.resumeCh <- errors.New("resume failed")
   513  		rts.mu.e.ResumeExit++
   514  		rts.mu.e.OnFailOrCancelStart = true
   515  		rts.failOrCancelCheckCh <- struct{}{}
   516  		rts.check(t, jobs.StatusReverting)
   517  
   518  		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
   519  		rts.check(t, jobs.StatusPaused)
   520  
   521  		rts.sqlDB.ExpectErr(t, "paused and has non-nil FinalResumeError resume", "CANCEL JOB $1", *j.ID())
   522  		rts.check(t, jobs.StatusPaused)
   523  
   524  		rts.sqlDB.Exec(t, "RESUME JOB $1", *j.ID())
   525  		rts.failOrCancelCheckCh <- struct{}{}
   526  		rts.check(t, jobs.StatusReverting)
   527  
   528  		rts.failOrCancelCh <- nil
   529  		rts.mu.e.OnFailOrCancelExit++
   530  		rts.check(t, jobs.StatusFailed)
   531  	})
   532  
   533  	// Verify that pause and cancel in a rollback do nothing.
   534  	t.Run("rollback", func(t *testing.T) {
   535  		rts := registryTestSuite{}
   536  		rts.setUp(t)
   537  		defer rts.tearDown()
   538  		job, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   539  		if err != nil {
   540  			t.Fatal(err)
   541  		}
   542  		rts.job = job
   543  
   544  		rts.mu.e.ResumeStart = true
   545  		rts.resumeCheckCh <- struct{}{}
   546  		rts.check(t, jobs.StatusRunning)
   547  
   548  		// Rollback a CANCEL.
   549  		{
   550  			txn, err := rts.outerDB.Begin()
   551  			if err != nil {
   552  				t.Fatal(err)
   553  			}
   554  			// OnFailOrCancel is *not* called in the same txn as the job is marked
   555  			// cancel-requested and it will only be called when the job is adopted
   556  			// again.
   557  			if _, err := txn.Exec("CANCEL JOB $1", *job.ID()); err != nil {
   558  				t.Fatal(err)
   559  			}
   560  			if err := txn.Rollback(); err != nil {
   561  				t.Fatal(err)
   562  			}
   563  			rts.progressCh <- struct{}{}
   564  			rts.resumeCheckCh <- struct{}{}
   565  			rts.check(t, jobs.StatusRunning)
   566  		}
   567  		// Rollback a PAUSE.
   568  		{
   569  			txn, err := rts.outerDB.Begin()
   570  			if err != nil {
   571  				t.Fatal(err)
   572  			}
   573  			if _, err := txn.Exec("PAUSE JOB $1", *job.ID()); err != nil {
   574  				t.Fatal(err)
   575  			}
   576  			if err := txn.Rollback(); err != nil {
   577  				t.Fatal(err)
   578  			}
   579  			rts.progressCh <- struct{}{}
   580  			rts.resumeCheckCh <- struct{}{}
   581  			rts.check(t, jobs.StatusRunning)
   582  		}
   583  		// Now pause it for reals.
   584  		{
   585  			txn, err := rts.outerDB.Begin()
   586  			if err != nil {
   587  				t.Fatal(err)
   588  			}
   589  			if _, err := txn.Exec("PAUSE JOB $1", *job.ID()); err != nil {
   590  				t.Fatal(err)
   591  			}
   592  			// Not committed yet, so state shouldn't have changed.
   593  			// Don't check status in txn.
   594  			rts.check(t, "")
   595  			if err := txn.Commit(); err != nil {
   596  				t.Fatal(err)
   597  			}
   598  			rts.check(t, jobs.StatusPaused)
   599  		}
   600  		// Rollback a RESUME.
   601  		{
   602  			txn, err := rts.outerDB.Begin()
   603  			if err != nil {
   604  				t.Fatal(err)
   605  			}
   606  			if _, err := txn.Exec("RESUME JOB $1", *job.ID()); err != nil {
   607  				t.Fatal(err)
   608  			}
   609  			if err := txn.Rollback(); err != nil {
   610  				t.Fatal(err)
   611  			}
   612  			rts.check(t, jobs.StatusPaused)
   613  		}
   614  		// Commit a RESUME.
   615  		{
   616  			txn, err := rts.outerDB.Begin()
   617  			if err != nil {
   618  				t.Fatal(err)
   619  			}
   620  			if _, err := txn.Exec("RESUME JOB $1", *job.ID()); err != nil {
   621  				t.Fatal(err)
   622  			}
   623  			// Not committed yet, so state shouldn't have changed.
   624  			// Don't check status in txn.
   625  			rts.check(t, "")
   626  			if err := txn.Commit(); err != nil {
   627  				t.Fatal(err)
   628  			}
   629  		}
   630  		rts.mu.e.ResumeStart = true
   631  		rts.check(t, jobs.StatusRunning)
   632  		rts.resumeCheckCh <- struct{}{}
   633  		rts.resumeCh <- nil
   634  		rts.mu.e.ResumeExit++
   635  		rts.mu.e.Success = true
   636  		rts.check(t, jobs.StatusSucceeded)
   637  	})
   638  
   639  	t.Run("failed running", func(t *testing.T) {
   640  		rts := registryTestSuite{}
   641  		rts.setUp(t)
   642  		defer rts.tearDown()
   643  
   644  		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   645  		if err != nil {
   646  			t.Fatal(err)
   647  		}
   648  		rts.job = j
   649  
   650  		rts.mu.e.ResumeStart = true
   651  		rts.check(t, jobs.StatusRunning)
   652  
   653  		rts.resumeCheckCh <- struct{}{}
   654  		rts.resumeCh <- errors.New("resume failed")
   655  		rts.mu.e.ResumeExit++
   656  		rts.mu.e.OnFailOrCancelStart = true
   657  		rts.failOrCancelCheckCh <- struct{}{}
   658  		rts.check(t, jobs.StatusReverting)
   659  
   660  		rts.failOrCancelCh <- nil
   661  		rts.mu.e.OnFailOrCancelExit++
   662  		rts.check(t, jobs.StatusFailed)
   663  	})
   664  
   665  	// Attempt to mark success, but fail, but fail that also.
   666  	t.Run("fail marking success and fail OnFailOrCancel", func(t *testing.T) {
   667  		rts := registryTestSuite{}
   668  		rts.setUp(t)
   669  		defer rts.tearDown()
   670  
   671  		// Make marking success fail.
   672  		rts.successErr = errors.New("resume failed")
   673  		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   674  		if err != nil {
   675  			t.Fatal(err)
   676  		}
   677  		rts.job = j
   678  
   679  		rts.mu.e.ResumeStart = true
   680  		rts.resumeCheckCh <- struct{}{}
   681  		rts.check(t, jobs.StatusRunning)
   682  
   683  		rts.resumeCh <- nil
   684  		rts.mu.e.ResumeExit++
   685  		rts.mu.e.Success = true
   686  		rts.mu.e.OnFailOrCancelStart = true
   687  
   688  		// The job is now in state reverting and will never resume again because
   689  		// OnFailOrCancel also fails.
   690  		rts.check(t, jobs.StatusReverting)
   691  		rts.failOrCancelCheckCh <- struct{}{}
   692  		rts.mu.e.OnFailOrCancelExit++
   693  		rts.failOrCancelCh <- errors.New("reverting failed")
   694  		rts.check(t, jobs.StatusFailed)
   695  	})
   696  
   697  	// Fail the job, but also fail to mark it failed.
   698  	t.Run("fail marking failed", func(t *testing.T) {
   699  		rts := registryTestSuite{}
   700  		rts.setUp(t)
   701  		defer rts.tearDown()
   702  
   703  		// Make marking success fail.
   704  		rts.successErr = errors.New("resume failed")
   705  		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   706  		if err != nil {
   707  			t.Fatal(err)
   708  		}
   709  		rts.job = j
   710  
   711  		rts.mu.e.ResumeStart = true
   712  		rts.resumeCheckCh <- struct{}{}
   713  		rts.check(t, jobs.StatusRunning)
   714  
   715  		rts.resumeCh <- errors.New("resume failed")
   716  		rts.mu.e.ResumeExit++
   717  		rts.mu.e.OnFailOrCancelStart = true
   718  		rts.failOrCancelCheckCh <- struct{}{}
   719  		// The job is now in state reverting and will never resume again.
   720  		rts.check(t, jobs.StatusReverting)
   721  
   722  		// But let it fail.
   723  		rts.mu.e.OnFailOrCancelExit++
   724  		rts.failOrCancelCh <- errors.New("resume failed")
   725  		rts.check(t, jobs.StatusFailed)
   726  	})
   727  
   728  	t.Run("fail 2.0 jobs with no progress", func(t *testing.T) {
   729  		rts := registryTestSuite{}
   730  		rts.setUp(t)
   731  		defer rts.tearDown()
   732  		job, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   733  		if err != nil {
   734  			t.Fatal(err)
   735  		}
   736  		rts.job = job
   737  		rts.mu.e.ResumeStart = true
   738  		rts.check(t, jobs.StatusRunning)
   739  		rts.sqlDB.Exec(t, "PAUSE JOB $1", *job.ID())
   740  		rts.sqlDB.Exec(t, `UPDATE system.jobs SET progress = NULL, status = $2 WHERE id = $1`, *job.ID(), jobs.StatusRunning)
   741  		testutils.SucceedsSoon(t, func() error {
   742  			var status jobs.Status
   743  			var err string
   744  			rts.sqlDB.QueryRow(t, `SELECT error, status FROM [SHOW JOBS] WHERE job_id = $1`, *job.ID()).Scan(&err, &status)
   745  			if status != jobs.StatusFailed {
   746  				return errors.Errorf("unexpected status: %s", status)
   747  			}
   748  			if err != "job predates cluster upgrade and must be re-run" {
   749  				return errors.Errorf("unexpected error: %s", err)
   750  			}
   751  			return nil
   752  		})
   753  		rts.check(t, jobs.StatusFailed)
   754  	})
   755  
   756  	t.Run("OnPauseRequest", func(t *testing.T) {
   757  		rts := registryTestSuite{}
   758  		rts.setUp(t)
   759  		defer rts.tearDown()
   760  		madeUpSpans := []roachpb.Span{
   761  			{Key: roachpb.Key("foo")},
   762  		}
   763  		rts.onPauseRequest = func(ctx context.Context, planHookState interface{}, txn *kv.Txn, progress *jobspb.Progress) error {
   764  			progress.GetImport().SpanProgress = madeUpSpans
   765  			return nil
   766  		}
   767  
   768  		job, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   769  		require.NoError(t, err)
   770  		rts.job = job
   771  
   772  		rts.resumeCheckCh <- struct{}{}
   773  		rts.mu.e.ResumeStart = true
   774  		rts.check(t, jobs.StatusRunning)
   775  
   776  		// Request that the job is paused.
   777  		pauseErrCh := make(chan error)
   778  		go func() {
   779  			_, err := rts.outerDB.Exec("PAUSE JOB $1", *job.ID())
   780  			pauseErrCh <- err
   781  		}()
   782  
   783  		// Ensure that the pause went off without a problem.
   784  		require.NoError(t, <-pauseErrCh)
   785  		rts.check(t, jobs.StatusPaused)
   786  		{
   787  			// Make sure the side-effects of our pause function occurred.
   788  			j, err := rts.registry.LoadJob(rts.ctx, *job.ID())
   789  			require.NoError(t, err)
   790  			progress := j.Progress()
   791  			require.Equal(t, madeUpSpans, progress.GetImport().SpanProgress)
   792  		}
   793  	})
   794  	t.Run("OnPauseRequest failure does not pause", func(t *testing.T) {
   795  		rts := registryTestSuite{}
   796  		rts.setUp(t)
   797  		defer rts.tearDown()
   798  
   799  		rts.onPauseRequest = func(ctx context.Context, planHookState interface{}, txn *kv.Txn, progress *jobspb.Progress) error {
   800  			return errors.New("boom")
   801  		}
   802  
   803  		job, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
   804  		require.NoError(t, err)
   805  		rts.job = job
   806  
   807  		// Allow the job to start.
   808  		rts.resumeCheckCh <- struct{}{}
   809  		rts.mu.e.ResumeStart = true
   810  		rts.check(t, jobs.StatusRunning)
   811  
   812  		// Request that the job is paused and ensure that the pause hit the error
   813  		// and failed to pause.
   814  		_, err = rts.outerDB.Exec("PAUSE JOB $1", *job.ID())
   815  		require.Regexp(t, "boom", err)
   816  
   817  		// Allow the job to complete.
   818  		rts.resumeCh <- nil
   819  		rts.mu.e.Success = true
   820  		rts.mu.e.ResumeExit++
   821  		rts.check(t, jobs.StatusSucceeded)
   822  	})
   823  }
   824  
   825  func TestJobLifecycle(t *testing.T) {
   826  	defer leaktest.AfterTest(t)()
   827  	defer jobs.ResetConstructors()()
   828  
   829  	ctx := context.Background()
   830  
   831  	s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{})
   832  	defer s.Stopper().Stop(ctx)
   833  
   834  	registry := s.JobRegistry().(*jobs.Registry)
   835  
   836  	createJob := func(record jobs.Record) (*jobs.Job, expectation) {
   837  		beforeTime := timeutil.Now()
   838  		job := registry.NewJob(record)
   839  		if err := job.Created(ctx); err != nil {
   840  			t.Fatal(err)
   841  		}
   842  		payload := job.Payload()
   843  		return job, expectation{
   844  			DB:     sqlDB,
   845  			Record: record,
   846  			Type:   payload.Type(),
   847  			Before: beforeTime,
   848  		}
   849  	}
   850  
   851  	defaultRecord := jobs.Record{
   852  		// Job does not accept an empty Details field, so arbitrarily provide
   853  		// ImportDetails.
   854  		Details:  jobspb.ImportDetails{},
   855  		Progress: jobspb.ImportProgress{},
   856  	}
   857  
   858  	createDefaultJob := func() (*jobs.Job, expectation) {
   859  		return createJob(defaultRecord)
   860  	}
   861  
   862  	done := make(chan struct{})
   863  	defer close(done)
   864  
   865  	jobs.RegisterConstructor(jobspb.TypeImport, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer {
   866  		return jobs.FakeResumer{
   867  			OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
   868  				select {
   869  				case <-ctx.Done():
   870  					return ctx.Err()
   871  				case <-done:
   872  					return nil
   873  				}
   874  			},
   875  		}
   876  	})
   877  
   878  	startLeasedJob := func(t *testing.T, record jobs.Record) (*jobs.Job, expectation) {
   879  		beforeTime := timeutil.Now()
   880  		job, _, err := registry.CreateAndStartJob(ctx, nil, record)
   881  		if err != nil {
   882  			t.Fatal(err)
   883  		}
   884  		payload := job.Payload()
   885  		return job, expectation{
   886  			DB:     sqlDB,
   887  			Record: record,
   888  			Type:   payload.Type(),
   889  			Before: beforeTime,
   890  		}
   891  	}
   892  
   893  	t.Run("valid job lifecycles succeed", func(t *testing.T) {
   894  		// Woody is a successful job.
   895  		woodyJob, woodyExp := createJob(jobs.Record{
   896  			Description:   "There's a snake in my boot!",
   897  			Username:      "Woody Pride",
   898  			DescriptorIDs: []sqlbase.ID{1, 2, 3},
   899  			Details:       jobspb.RestoreDetails{},
   900  			Progress:      jobspb.RestoreProgress{},
   901  		})
   902  
   903  		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusRunning); err != nil {
   904  			t.Fatal(err)
   905  		}
   906  
   907  		if err := woodyJob.Started(ctx); err != nil {
   908  			t.Fatal(err)
   909  		}
   910  		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusRunning); err != nil {
   911  			t.Fatal(err)
   912  		}
   913  
   914  		// This fraction completed progression tests that calling Progressed with a
   915  		// fractionCompleted that is less than the last-recorded fractionCompleted
   916  		// is observed.
   917  		progresses := []struct {
   918  			actual   float32
   919  			expected float32
   920  		}{
   921  			{0.0, 0.0}, {0.5, 0.5}, {0.5, 0.5}, {0.4, 0.4}, {0.8, 0.8}, {1.0, 1.0},
   922  		}
   923  		for _, f := range progresses {
   924  			if err := woodyJob.FractionProgressed(ctx, jobs.FractionUpdater(f.actual)); err != nil {
   925  				t.Fatal(err)
   926  			}
   927  			woodyExp.FractionCompleted = f.expected
   928  			if err := woodyExp.verify(woodyJob.ID(), jobs.StatusRunning); err != nil {
   929  				t.Fatal(err)
   930  			}
   931  		}
   932  
   933  		// Test Progressed callbacks.
   934  		if err := woodyJob.FractionProgressed(ctx, func(_ context.Context, details jobspb.ProgressDetails) float32 {
   935  			details.(*jobspb.Progress_Restore).Restore.HighWater = roachpb.Key("mariana")
   936  			return 1.0
   937  		}); err != nil {
   938  			t.Fatal(err)
   939  		}
   940  		woodyExp.Record.Progress = jobspb.RestoreProgress{HighWater: roachpb.Key("mariana")}
   941  		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusRunning); err != nil {
   942  			t.Fatal(err)
   943  		}
   944  
   945  		if err := woodyJob.Succeeded(ctx); err != nil {
   946  			t.Fatal(err)
   947  		}
   948  		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusSucceeded); err != nil {
   949  			t.Fatal(err)
   950  		}
   951  
   952  		// Buzz fails after it starts running.
   953  		buzzRecord := jobs.Record{
   954  			Description:   "To infinity and beyond!",
   955  			Username:      "Buzz Lightyear",
   956  			DescriptorIDs: []sqlbase.ID{3, 2, 1},
   957  			Details:       jobspb.BackupDetails{},
   958  			Progress:      jobspb.BackupProgress{},
   959  		}
   960  		buzzExp := expectation{
   961  			DB:     sqlDB,
   962  			Record: buzzRecord,
   963  			Type:   jobspb.TypeBackup,
   964  			Before: timeutil.Now(),
   965  			Error:  "Buzz Lightyear can't fly",
   966  		}
   967  		buzzJob := registry.NewJob(buzzRecord)
   968  
   969  		if err := buzzJob.Created(ctx); err != nil {
   970  			t.Fatal(err)
   971  		}
   972  		if err := buzzExp.verify(buzzJob.ID(), jobs.StatusRunning); err != nil {
   973  			t.Fatal(err)
   974  		}
   975  
   976  		if err := buzzJob.Started(ctx); err != nil {
   977  			t.Fatal(err)
   978  		}
   979  		if err := buzzExp.verify(buzzJob.ID(), jobs.StatusRunning); err != nil {
   980  			t.Fatal(err)
   981  		}
   982  
   983  		if err := buzzJob.FractionProgressed(ctx, jobs.FractionUpdater(.42)); err != nil {
   984  			t.Fatal(err)
   985  		}
   986  		buzzExp.FractionCompleted = .42
   987  		if err := buzzExp.verify(buzzJob.ID(), jobs.StatusRunning); err != nil {
   988  			t.Fatal(err)
   989  		}
   990  
   991  		if err := buzzJob.Failed(ctx, errors.New("Buzz Lightyear can't fly")); err != nil {
   992  			t.Fatal(err)
   993  		}
   994  		if err := buzzExp.verify(buzzJob.ID(), jobs.StatusFailed); err != nil {
   995  			t.Fatal(err)
   996  		}
   997  
   998  		// Ensure that logging Buzz didn't corrupt Woody.
   999  		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusSucceeded); err != nil {
  1000  			t.Fatal(err)
  1001  		}
  1002  
  1003  		// Sid fails before it starts running.
  1004  		sidJob, sidExp := createJob(jobs.Record{
  1005  			Description:   "The toys! The toys are alive!",
  1006  			Username:      "Sid Phillips",
  1007  			DescriptorIDs: []sqlbase.ID{6, 6, 6},
  1008  			Details:       jobspb.RestoreDetails{},
  1009  			Progress:      jobspb.RestoreProgress{},
  1010  		})
  1011  
  1012  		if err := sidExp.verify(sidJob.ID(), jobs.StatusRunning); err != nil {
  1013  			t.Fatal(err)
  1014  		}
  1015  
  1016  		if err := sidJob.Failed(ctx, errors.New("Sid is a total failure")); err != nil {
  1017  			t.Fatal(err)
  1018  		}
  1019  		sidExp.Error = "Sid is a total failure"
  1020  		if err := sidExp.verify(sidJob.ID(), jobs.StatusFailed); err != nil {
  1021  			t.Fatal(err)
  1022  		}
  1023  
  1024  		// Ensure that logging Sid didn't corrupt Woody or Buzz.
  1025  		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusSucceeded); err != nil {
  1026  			t.Fatal(err)
  1027  		}
  1028  		if err := buzzExp.verify(buzzJob.ID(), jobs.StatusFailed); err != nil {
  1029  			t.Fatal(err)
  1030  		}
  1031  	})
  1032  
  1033  	t.Run("FinishedWith", func(t *testing.T) {
  1034  		t.Run("nil error marks job as successful", func(t *testing.T) {
  1035  			job, exp := createDefaultJob()
  1036  			exp.FractionCompleted = 1.0
  1037  			if err := job.Started(ctx); err != nil {
  1038  				t.Fatal(err)
  1039  			}
  1040  			if err := job.Succeeded(ctx); err != nil {
  1041  				t.Fatal(err)
  1042  			}
  1043  			if err := exp.verify(job.ID(), jobs.StatusSucceeded); err != nil {
  1044  				t.Fatal(err)
  1045  			}
  1046  		})
  1047  
  1048  		t.Run("non-nil error marks job as failed", func(t *testing.T) {
  1049  			job, exp := createDefaultJob()
  1050  			boom := errors.New("boom")
  1051  			exp.Error = boom.Error()
  1052  			if err := job.Failed(ctx, boom); err != nil {
  1053  				t.Fatal(err)
  1054  			}
  1055  			if err := exp.verify(job.ID(), jobs.StatusFailed); err != nil {
  1056  				t.Fatal(err)
  1057  			}
  1058  		})
  1059  
  1060  		t.Run("internal errors are not swallowed if marking job as successful", func(t *testing.T) {
  1061  			job, _ := createDefaultJob()
  1062  			if _, err := sqlDB.Exec(
  1063  				`UPDATE system.jobs SET payload = 'garbage' WHERE id = $1`, *job.ID(),
  1064  			); err != nil {
  1065  				t.Fatal(err)
  1066  			}
  1067  			if err := job.Succeeded(ctx); !testutils.IsError(err, "wrong wireType") {
  1068  				t.Fatalf("unexpected: %v", err)
  1069  			}
  1070  		})
  1071  
  1072  		t.Run("internal errors are not swallowed if marking job as failed", func(t *testing.T) {
  1073  			job, _ := createDefaultJob()
  1074  			if _, err := sqlDB.Exec(
  1075  				`UPDATE system.jobs SET payload = 'garbage' WHERE id = $1`, *job.ID(),
  1076  			); err != nil {
  1077  				t.Fatal(err)
  1078  			}
  1079  			if err := job.Failed(ctx, errors.New("boom")); !testutils.IsError(err, "wrong wireType") {
  1080  				t.Fatalf("unexpected: %v", err)
  1081  			}
  1082  		})
  1083  	})
  1084  
  1085  	t.Run("cancelable jobs can be paused until finished", func(t *testing.T) {
  1086  		job, exp := startLeasedJob(t, defaultRecord)
  1087  
  1088  		if err := registry.PauseRequested(ctx, nil, *job.ID()); err != nil {
  1089  			t.Fatal(err)
  1090  		}
  1091  		if err := job.Paused(ctx); err != nil {
  1092  			t.Fatal(err)
  1093  		}
  1094  		if err := exp.verify(job.ID(), jobs.StatusPaused); err != nil {
  1095  			t.Fatal(err)
  1096  		}
  1097  		if err := registry.Resume(ctx, nil, *job.ID()); err != nil {
  1098  			t.Fatal(err)
  1099  		}
  1100  		// Resume the job again to ensure that the resumption is idempotent.
  1101  		if err := registry.Resume(ctx, nil, *job.ID()); err != nil {
  1102  			t.Fatal(err)
  1103  		}
  1104  		if err := exp.verify(job.ID(), jobs.StatusRunning); err != nil {
  1105  			t.Fatal(err)
  1106  		}
  1107  
  1108  		// PauseRequested fails after job is successful.
  1109  		if err := job.Succeeded(ctx); err != nil {
  1110  			t.Fatal(err)
  1111  		}
  1112  		if err := registry.PauseRequested(ctx, nil, *job.ID()); !testutils.IsError(err, "cannot be requested to be paused") {
  1113  			t.Fatalf("expected 'cannot pause succeeded job', but got '%s'", err)
  1114  		}
  1115  	})
  1116  
  1117  	t.Run("cancelable jobs can be canceled until finished", func(t *testing.T) {
  1118  		{
  1119  			job, exp := startLeasedJob(t, defaultRecord)
  1120  			if err := registry.CancelRequested(ctx, nil, *job.ID()); err != nil {
  1121  				t.Fatal(err)
  1122  			}
  1123  			if err := exp.verify(job.ID(), jobs.StatusCancelRequested); err != nil {
  1124  				t.Fatal(err)
  1125  			}
  1126  		}
  1127  
  1128  		{
  1129  			job, exp := startLeasedJob(t, defaultRecord)
  1130  			if err := job.Started(ctx); err != nil {
  1131  				t.Fatal(err)
  1132  			}
  1133  			if err := registry.CancelRequested(ctx, nil, *job.ID()); err != nil {
  1134  				t.Fatal(err)
  1135  			}
  1136  			if err := exp.verify(job.ID(), jobs.StatusCancelRequested); err != nil {
  1137  				t.Fatal(err)
  1138  			}
  1139  		}
  1140  
  1141  		{
  1142  			job, exp := startLeasedJob(t, defaultRecord)
  1143  			if err := registry.PauseRequested(ctx, nil, *job.ID()); err != nil {
  1144  				t.Fatal(err)
  1145  			}
  1146  			if err := job.Paused(ctx); err != nil {
  1147  				t.Fatal(err)
  1148  			}
  1149  			if err := registry.CancelRequested(ctx, nil, *job.ID()); err != nil {
  1150  				t.Fatal(err)
  1151  			}
  1152  			if err := exp.verify(job.ID(), jobs.StatusCancelRequested); err != nil {
  1153  				t.Fatal(err)
  1154  			}
  1155  		}
  1156  
  1157  		{
  1158  			job, _ := startLeasedJob(t, defaultRecord)
  1159  			if err := job.Succeeded(ctx); err != nil {
  1160  				t.Fatal(err)
  1161  			}
  1162  			expectedErr := "job with status succeeded cannot be requested to be canceled"
  1163  			if err := registry.CancelRequested(ctx, nil, *job.ID()); !testutils.IsError(err, expectedErr) {
  1164  				t.Fatalf("expected '%s', but got '%s'", expectedErr, err)
  1165  			}
  1166  		}
  1167  	})
  1168  
  1169  	t.Run("unpaused jobs cannot be resumed", func(t *testing.T) {
  1170  		{
  1171  			job, _ := startLeasedJob(t, defaultRecord)
  1172  			if err := registry.CancelRequested(ctx, nil, *job.ID()); err != nil {
  1173  				t.Fatal(err)
  1174  			}
  1175  			if err := registry.Resume(ctx, nil, *job.ID()); !testutils.IsError(err, "cannot be resumed") {
  1176  				t.Errorf("got unexpected status '%v'", err)
  1177  			}
  1178  		}
  1179  
  1180  		{
  1181  			job, _ := startLeasedJob(t, defaultRecord)
  1182  			if err := job.Succeeded(ctx); err != nil {
  1183  				t.Fatal(err)
  1184  			}
  1185  			expectedErr := fmt.Sprintf("job with status %s cannot be resumed", jobs.StatusSucceeded)
  1186  			if err := registry.Resume(ctx, nil, *job.ID()); !testutils.IsError(err, expectedErr) {
  1187  				t.Errorf("expected '%s', but got '%v'", expectedErr, err)
  1188  			}
  1189  		}
  1190  	})
  1191  
  1192  	t.Run("bad job details fail", func(t *testing.T) {
  1193  		defer func() {
  1194  			if r, ok := recover().(string); !ok || !strings.Contains(r, "unknown details type int") {
  1195  				t.Fatalf("expected 'unknown details type int', but got: %v", r)
  1196  			}
  1197  		}()
  1198  
  1199  		job := registry.NewJob(jobs.Record{
  1200  			Details: 42,
  1201  		})
  1202  		_ = job.Created(ctx)
  1203  	})
  1204  
  1205  	t.Run("update before create fails", func(t *testing.T) {
  1206  		job := registry.NewJob(jobs.Record{
  1207  			Details:  jobspb.RestoreDetails{},
  1208  			Progress: jobspb.RestoreProgress{},
  1209  		})
  1210  		if err := job.Started(ctx); !testutils.IsError(err, "job not created") {
  1211  			t.Fatalf("expected 'job not created' error, but got %v", err)
  1212  		}
  1213  	})
  1214  
  1215  	t.Run("same state transition twice succeeds silently", func(t *testing.T) {
  1216  		job, _ := createDefaultJob()
  1217  		if err := job.Started(ctx); err != nil {
  1218  			t.Fatal(err)
  1219  		}
  1220  		if err := job.Started(ctx); err != nil {
  1221  			t.Fatal(err)
  1222  		}
  1223  		if err := job.Succeeded(ctx); err != nil {
  1224  			t.Fatal(err)
  1225  		}
  1226  		if err := job.Succeeded(ctx); err != nil {
  1227  			t.Fatal(err)
  1228  		}
  1229  	})
  1230  
  1231  	t.Run("high-water progress works", func(t *testing.T) {
  1232  		job, _ := createDefaultJob()
  1233  		if err := job.Started(ctx); err != nil {
  1234  			t.Fatal(err)
  1235  		}
  1236  		highWaters := []hlc.Timestamp{
  1237  			{WallTime: 1, Logical: 1},
  1238  			{WallTime: 2, Logical: 0},
  1239  		}
  1240  		for _, ts := range highWaters {
  1241  			if err := job.HighWaterProgressed(
  1242  				ctx, func(context.Context, *kv.Txn, jobspb.ProgressDetails) (hlc.Timestamp, error) { return ts, nil },
  1243  			); err != nil {
  1244  				t.Fatal(err)
  1245  			}
  1246  			p := job.Progress()
  1247  			if actual := *p.GetHighWater(); actual != ts {
  1248  				t.Fatalf(`got %s expected %s`, actual, ts)
  1249  			}
  1250  		}
  1251  	})
  1252  
  1253  	t.Run("out of bounds progress fails", func(t *testing.T) {
  1254  		job, _ := createDefaultJob()
  1255  		if err := job.Started(ctx); err != nil {
  1256  			t.Fatal(err)
  1257  		}
  1258  		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(-0.1)); !testutils.IsError(err, "outside allowable range") {
  1259  			t.Fatalf("expected 'outside allowable range' error, but got %v", err)
  1260  		}
  1261  		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(1.1)); !testutils.IsError(err, "outside allowable range") {
  1262  			t.Fatalf("expected 'outside allowable range' error, but got %v", err)
  1263  		}
  1264  		if err := job.HighWaterProgressed(
  1265  			ctx, func(context.Context, *kv.Txn, jobspb.ProgressDetails) (hlc.Timestamp, error) {
  1266  				return hlc.Timestamp{WallTime: -1}, nil
  1267  			},
  1268  		); !testutils.IsError(err, "outside allowable range") {
  1269  			t.Fatalf("expected 'outside allowable range' error, but got %v", err)
  1270  		}
  1271  	})
  1272  
  1273  	t.Run("error propagates", func(t *testing.T) {
  1274  		job, _ := createDefaultJob()
  1275  		if err := job.Started(ctx); err != nil {
  1276  			t.Fatal(err)
  1277  		}
  1278  		if err := job.HighWaterProgressed(
  1279  			ctx, func(context.Context, *kv.Txn, jobspb.ProgressDetails) (hlc.Timestamp, error) {
  1280  				return hlc.Timestamp{WallTime: 2}, errors.Errorf("boom")
  1281  			},
  1282  		); !testutils.IsError(err, "boom") {
  1283  			t.Fatalf("expected 'boom' error, but got %v", err)
  1284  		}
  1285  	})
  1286  
  1287  	t.Run("progress on finished job fails", func(t *testing.T) {
  1288  		job, _ := createDefaultJob()
  1289  		if err := job.Started(ctx); err != nil {
  1290  			t.Fatal(err)
  1291  		}
  1292  		if err := job.Succeeded(ctx); err != nil {
  1293  			t.Fatal(err)
  1294  		}
  1295  		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(0.5)); !testutils.IsError(
  1296  			err, `cannot update progress on succeeded job \(id \d+\)`,
  1297  		) {
  1298  			t.Fatalf("expected 'cannot update progress' error, but got %v", err)
  1299  		}
  1300  	})
  1301  
  1302  	t.Run("progress on paused job fails", func(t *testing.T) {
  1303  		job, _ := startLeasedJob(t, defaultRecord)
  1304  		if err := registry.PauseRequested(ctx, nil, *job.ID()); err != nil {
  1305  			t.Fatal(err)
  1306  		}
  1307  		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(0.5)); !testutils.IsError(
  1308  			err, `cannot update progress on pause-requested job`,
  1309  		) {
  1310  			t.Fatalf("expected progress error, but got %v", err)
  1311  		}
  1312  	})
  1313  
  1314  	t.Run("progress on canceled job fails", func(t *testing.T) {
  1315  		job, _ := startLeasedJob(t, defaultRecord)
  1316  		if err := registry.CancelRequested(ctx, nil, *job.ID()); err != nil {
  1317  			t.Fatal(err)
  1318  		}
  1319  		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(0.5)); !testutils.IsError(
  1320  			err, `cannot update progress on cancel-requested job \(id \d+\)`,
  1321  		) {
  1322  			t.Fatalf("expected progress error, but got %v", err)
  1323  		}
  1324  	})
  1325  
  1326  	t.Run("succeeded forces fraction completed to 1.0", func(t *testing.T) {
  1327  		job, exp := createDefaultJob()
  1328  		if err := job.Started(ctx); err != nil {
  1329  			t.Fatal(err)
  1330  		}
  1331  		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(0.2)); err != nil {
  1332  			t.Fatal(err)
  1333  		}
  1334  		if err := job.Succeeded(ctx); err != nil {
  1335  			t.Fatal(err)
  1336  		}
  1337  		exp.FractionCompleted = 1.0
  1338  		if err := exp.verify(job.ID(), jobs.StatusSucceeded); err != nil {
  1339  			t.Fatal(err)
  1340  		}
  1341  	})
  1342  
  1343  	t.Run("set details works", func(t *testing.T) {
  1344  		job, exp := createJob(jobs.Record{
  1345  			Details:  jobspb.RestoreDetails{},
  1346  			Progress: jobspb.RestoreProgress{},
  1347  		})
  1348  		if err := exp.verify(job.ID(), jobs.StatusRunning); err != nil {
  1349  			t.Fatal(err)
  1350  		}
  1351  		newDetails := jobspb.RestoreDetails{URIs: []string{"new"}}
  1352  		exp.Record.Details = newDetails
  1353  		if err := job.SetDetails(ctx, newDetails); err != nil {
  1354  			t.Fatal(err)
  1355  		}
  1356  		if err := exp.verify(job.ID(), jobs.StatusRunning); err != nil {
  1357  			t.Fatal(err)
  1358  		}
  1359  	})
  1360  
  1361  	t.Run("set progress works", func(t *testing.T) {
  1362  		job, exp := createJob(jobs.Record{
  1363  			Details:  jobspb.RestoreDetails{},
  1364  			Progress: jobspb.RestoreProgress{},
  1365  		})
  1366  		if err := exp.verify(job.ID(), jobs.StatusRunning); err != nil {
  1367  			t.Fatal(err)
  1368  		}
  1369  		newDetails := jobspb.RestoreProgress{HighWater: []byte{42}}
  1370  		exp.Record.Progress = newDetails
  1371  		if err := job.SetProgress(ctx, newDetails); err != nil {
  1372  			t.Fatal(err)
  1373  		}
  1374  		if err := exp.verify(job.ID(), jobs.StatusRunning); err != nil {
  1375  			t.Fatal(err)
  1376  		}
  1377  	})
  1378  
  1379  }
  1380  
  1381  // TestShowJobs manually inserts a row into system.jobs and checks that the
  1382  // encoded protobuf payload is properly decoded and visible in
  1383  // crdb_internal.jobs.
  1384  func TestShowJobs(t *testing.T) {
  1385  	defer leaktest.AfterTest(t)()
  1386  
  1387  	params, _ := tests.CreateTestServerParams()
  1388  	s, rawSQLDB, _ := serverutils.StartServer(t, params)
  1389  	sqlDB := sqlutils.MakeSQLRunner(rawSQLDB)
  1390  	defer s.Stopper().Stop(context.Background())
  1391  
  1392  	// row represents a row returned from crdb_internal.jobs, but
  1393  	// *not* a row in system.jobs.
  1394  	type row struct {
  1395  		id                int64
  1396  		typ               string
  1397  		status            string
  1398  		description       string
  1399  		username          string
  1400  		err               string
  1401  		created           time.Time
  1402  		started           time.Time
  1403  		finished          time.Time
  1404  		modified          time.Time
  1405  		fractionCompleted float32
  1406  		highWater         hlc.Timestamp
  1407  		coordinatorID     roachpb.NodeID
  1408  		details           jobspb.Details
  1409  	}
  1410  
  1411  	for _, in := range []row{
  1412  		{
  1413  			id:          42,
  1414  			typ:         "SCHEMA CHANGE",
  1415  			status:      "superfailed",
  1416  			description: "failjob",
  1417  			username:    "failure",
  1418  			err:         "boom",
  1419  			// lib/pq returns time.Time objects with goofy locations, which breaks
  1420  			// reflect.DeepEqual without this time.FixedZone song and dance.
  1421  			// See: https://github.com/lib/pq/issues/329
  1422  			created:           timeutil.Unix(1, 0).In(time.FixedZone("", 0)),
  1423  			started:           timeutil.Unix(2, 0).In(time.FixedZone("", 0)),
  1424  			finished:          timeutil.Unix(3, 0).In(time.FixedZone("", 0)),
  1425  			modified:          timeutil.Unix(4, 0).In(time.FixedZone("", 0)),
  1426  			fractionCompleted: 0.42,
  1427  			coordinatorID:     7,
  1428  			details:           jobspb.SchemaChangeDetails{},
  1429  		},
  1430  		{
  1431  			id:          43,
  1432  			typ:         "CHANGEFEED",
  1433  			status:      "running",
  1434  			description: "persistent feed",
  1435  			username:    "persistent",
  1436  			err:         "",
  1437  			// lib/pq returns time.Time objects with goofy locations, which breaks
  1438  			// reflect.DeepEqual without this time.FixedZone song and dance.
  1439  			// See: https://github.com/lib/pq/issues/329
  1440  			created:  timeutil.Unix(1, 0).In(time.FixedZone("", 0)),
  1441  			started:  timeutil.Unix(2, 0).In(time.FixedZone("", 0)),
  1442  			finished: timeutil.Unix(3, 0).In(time.FixedZone("", 0)),
  1443  			modified: timeutil.Unix(4, 0).In(time.FixedZone("", 0)),
  1444  			highWater: hlc.Timestamp{
  1445  				WallTime: 1533143242000000,
  1446  				Logical:  4,
  1447  			},
  1448  			coordinatorID: 7,
  1449  			details:       jobspb.ChangefeedDetails{},
  1450  		},
  1451  	} {
  1452  		t.Run("", func(t *testing.T) {
  1453  			// system.jobs is part proper SQL columns, part protobuf, so we can't use the
  1454  			// row struct directly.
  1455  			inPayload, err := protoutil.Marshal(&jobspb.Payload{
  1456  				Description:    in.description,
  1457  				StartedMicros:  in.started.UnixNano() / time.Microsecond.Nanoseconds(),
  1458  				FinishedMicros: in.finished.UnixNano() / time.Microsecond.Nanoseconds(),
  1459  				Username:       in.username,
  1460  				Lease: &jobspb.Lease{
  1461  					NodeID: 7,
  1462  				},
  1463  				Error:   in.err,
  1464  				Details: jobspb.WrapPayloadDetails(in.details),
  1465  			})
  1466  			if err != nil {
  1467  				t.Fatal(err)
  1468  			}
  1469  
  1470  			progress := &jobspb.Progress{
  1471  				ModifiedMicros: in.modified.UnixNano() / time.Microsecond.Nanoseconds(),
  1472  			}
  1473  			if in.highWater != (hlc.Timestamp{}) {
  1474  				progress.Progress = &jobspb.Progress_HighWater{
  1475  					HighWater: &in.highWater,
  1476  				}
  1477  			} else {
  1478  				progress.Progress = &jobspb.Progress_FractionCompleted{
  1479  					FractionCompleted: in.fractionCompleted,
  1480  				}
  1481  			}
  1482  			inProgress, err := protoutil.Marshal(progress)
  1483  			if err != nil {
  1484  				t.Fatal(err)
  1485  			}
  1486  			sqlDB.Exec(t,
  1487  				`INSERT INTO system.jobs (id, status, created, payload, progress) VALUES ($1, $2, $3, $4, $5)`,
  1488  				in.id, in.status, in.created, inPayload, inProgress,
  1489  			)
  1490  
  1491  			var out row
  1492  			var maybeFractionCompleted *float32
  1493  			var decimalHighWater *apd.Decimal
  1494  			sqlDB.QueryRow(t, `
  1495        SELECT job_id, job_type, status, created, description, started, finished, modified,
  1496               fraction_completed, high_water_timestamp, user_name, ifnull(error, ''), coordinator_id
  1497          FROM crdb_internal.jobs WHERE job_id = $1`, in.id).Scan(
  1498  				&out.id, &out.typ, &out.status, &out.created, &out.description, &out.started,
  1499  				&out.finished, &out.modified, &maybeFractionCompleted, &decimalHighWater, &out.username,
  1500  				&out.err, &out.coordinatorID,
  1501  			)
  1502  
  1503  			if decimalHighWater != nil {
  1504  				var err error
  1505  				out.highWater, err = tree.DecimalToHLC(decimalHighWater)
  1506  				if err != nil {
  1507  					t.Fatal(err)
  1508  				}
  1509  			}
  1510  
  1511  			if maybeFractionCompleted != nil {
  1512  				out.fractionCompleted = *maybeFractionCompleted
  1513  			}
  1514  
  1515  			// details field is not explicitly checked for equality; its value is
  1516  			// confirmed via the job_type field, which is dependent on the details
  1517  			// field.
  1518  			out.details = in.details
  1519  
  1520  			if !reflect.DeepEqual(in, out) {
  1521  				diff := strings.Join(pretty.Diff(in, out), "\n")
  1522  				t.Fatalf("in job did not match out job:\n%s", diff)
  1523  			}
  1524  		})
  1525  	}
  1526  }
  1527  
  1528  func TestShowAutomaticJobs(t *testing.T) {
  1529  	defer leaktest.AfterTest(t)()
  1530  
  1531  	params, _ := tests.CreateTestServerParams()
  1532  	s, rawSQLDB, _ := serverutils.StartServer(t, params)
  1533  	sqlDB := sqlutils.MakeSQLRunner(rawSQLDB)
  1534  	defer s.Stopper().Stop(context.Background())
  1535  
  1536  	// row represents a row returned from crdb_internal.jobs, but
  1537  	// *not* a row in system.jobs.
  1538  	type row struct {
  1539  		id      int64
  1540  		typ     string
  1541  		status  string
  1542  		details jobspb.Details
  1543  	}
  1544  
  1545  	rows := []row{
  1546  		{
  1547  			id:      1,
  1548  			typ:     "CREATE STATS",
  1549  			status:  "running",
  1550  			details: jobspb.CreateStatsDetails{Name: "my_stats"},
  1551  		},
  1552  		{
  1553  			id:      2,
  1554  			typ:     "AUTO CREATE STATS",
  1555  			status:  "running",
  1556  			details: jobspb.CreateStatsDetails{Name: "__auto__"},
  1557  		},
  1558  	}
  1559  
  1560  	for _, in := range rows {
  1561  		// system.jobs is part proper SQL columns, part protobuf, so we can't use the
  1562  		// row struct directly.
  1563  		inPayload, err := protoutil.Marshal(&jobspb.Payload{
  1564  			Details: jobspb.WrapPayloadDetails(in.details),
  1565  		})
  1566  		if err != nil {
  1567  			t.Fatal(err)
  1568  		}
  1569  
  1570  		sqlDB.Exec(t,
  1571  			`INSERT INTO system.jobs (id, status, payload) VALUES ($1, $2, $3)`,
  1572  			in.id, in.status, inPayload,
  1573  		)
  1574  	}
  1575  
  1576  	var out row
  1577  
  1578  	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOB 1]`).Scan(&out.id, &out.typ)
  1579  	if out.id != 1 || out.typ != "CREATE STATS" {
  1580  		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
  1581  			1, "CREATE STATS", out.id, out.typ)
  1582  	}
  1583  
  1584  	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOBS SELECT 1]`).Scan(&out.id, &out.typ)
  1585  	if out.id != 1 || out.typ != "CREATE STATS" {
  1586  		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
  1587  			1, "CREATE STATS", out.id, out.typ)
  1588  	}
  1589  
  1590  	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOBS (SELECT 1)]`).Scan(&out.id, &out.typ)
  1591  	if out.id != 1 || out.typ != "CREATE STATS" {
  1592  		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
  1593  			1, "CREATE STATS", out.id, out.typ)
  1594  	}
  1595  	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOB 2]`).Scan(&out.id, &out.typ)
  1596  	if out.id != 2 || out.typ != "AUTO CREATE STATS" {
  1597  		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
  1598  			2, "AUTO CREATE STATS", out.id, out.typ)
  1599  	}
  1600  
  1601  	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOBS SELECT 2]`).Scan(&out.id, &out.typ)
  1602  	if out.id != 2 || out.typ != "AUTO CREATE STATS" {
  1603  		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
  1604  			2, "AUTO CREATE STATS", out.id, out.typ)
  1605  	}
  1606  
  1607  	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOBS]`).Scan(&out.id, &out.typ)
  1608  	if out.id != 1 || out.typ != "CREATE STATS" {
  1609  		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
  1610  			1, "CREATE STATS", out.id, out.typ)
  1611  	}
  1612  
  1613  	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW AUTOMATIC JOBS]`).Scan(&out.id, &out.typ)
  1614  	if out.id != 2 || out.typ != "AUTO CREATE STATS" {
  1615  		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
  1616  			2, "AUTO CREATE STATS", out.id, out.typ)
  1617  	}
  1618  }
  1619  
  1620  func TestShowJobsWithError(t *testing.T) {
  1621  	defer leaktest.AfterTest(t)()
  1622  
  1623  	params, _ := tests.CreateTestServerParams()
  1624  	s, sqlDB, _ := serverutils.StartServer(t, params)
  1625  	defer s.Stopper().Stop(context.Background())
  1626  
  1627  	// Create at least 6 rows, ensuring 3 rows are corrupted.
  1628  	// Ensure there is at least one row in system.jobs.
  1629  	if _, err := sqlDB.Exec(`
  1630       CREATE TABLE foo(x INT); ALTER TABLE foo ADD COLUMN y INT;
  1631  	`); err != nil {
  1632  		t.Fatal(err)
  1633  	}
  1634  	// Get the id of the ADD COLUMN job to use later.
  1635  	var jobID int64
  1636  	if err := sqlDB.QueryRow(`SELECT id FROM system.jobs ORDER BY id DESC LIMIT 1`).Scan(&jobID); err != nil {
  1637  		t.Fatal(err)
  1638  	}
  1639  
  1640  	// Now insert more rows based on the valid row, some of which are corrupted.
  1641  	if _, err := sqlDB.Exec(`
  1642       -- Create a corrupted payload field from the most recent row.
  1643       INSERT INTO system.jobs(id, status, payload, progress) SELECT id+1, status, '\xaaaa'::BYTES, progress FROM system.jobs WHERE id = $1;
  1644  	`, jobID); err != nil {
  1645  		t.Fatal(err)
  1646  	}
  1647  	if _, err := sqlDB.Exec(`
  1648       -- Create a corrupted progress field.
  1649       INSERT INTO system.jobs(id, status, payload, progress) SELECT id+2, status, payload, '\xaaaa'::BYTES FROM system.jobs WHERE id = $1; 
  1650  	`, jobID); err != nil {
  1651  		t.Fatal(err)
  1652  	}
  1653  	if _, err := sqlDB.Exec(`
  1654       -- Corrupt both fields.
  1655       INSERT INTO system.jobs(id, status, payload, progress) SELECT id+3, status, '\xaaaa'::BYTES, '\xaaaa'::BYTES FROM system.jobs WHERE id = $1;
  1656  	`, jobID); err != nil {
  1657  		t.Fatal(err)
  1658  	}
  1659  	if _, err := sqlDB.Exec(`
  1660       -- Test what happens with a NULL progress field (which is a valid value).
  1661       INSERT INTO system.jobs(id, status, payload, progress) SELECT id+4, status, payload, NULL::BYTES FROM system.jobs WHERE id = $1;
  1662  	`, jobID); err != nil {
  1663  		t.Fatal(err)
  1664  	}
  1665  	if _, err := sqlDB.Exec(`
  1666       -- Test what happens with a NULL progress field (which is a valid value).
  1667       INSERT INTO system.jobs(id, status, payload, progress) SELECT id+5, status, '\xaaaa'::BYTES, NULL::BYTES FROM system.jobs WHERE id = $1;
  1668  	`, jobID); err != nil {
  1669  		t.Fatal(err)
  1670  	}
  1671  
  1672  	// Extract the last 6 rows from the query.
  1673  	rows, err := sqlDB.Query(`
  1674    WITH a AS (SELECT job_id, description, fraction_completed, error FROM [SHOW JOBS] ORDER BY job_id DESC LIMIT 6)
  1675    SELECT ifnull(description, 'NULL'), ifnull(fraction_completed, -1)::string, ifnull(error,'NULL') FROM a ORDER BY job_id ASC`)
  1676  	if err != nil {
  1677  		t.Fatal(err)
  1678  	}
  1679  	defer rows.Close()
  1680  
  1681  	var desc, frac, errStr string
  1682  
  1683  	// Valid row.
  1684  	rowNum := 0
  1685  	if !rows.Next() {
  1686  		t.Fatalf("%d too few rows", rowNum)
  1687  	}
  1688  	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
  1689  		t.Fatalf("%d: %v", rowNum, err)
  1690  	}
  1691  	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
  1692  	if desc == "NULL" || errStr != "" || frac[0] == '-' {
  1693  		t.Fatalf("%d: invalid row", rowNum)
  1694  	}
  1695  	rowNum++
  1696  
  1697  	// Corrupted payload but valid progress.
  1698  	if !rows.Next() {
  1699  		t.Fatalf("%d: too few rows", rowNum)
  1700  	}
  1701  	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
  1702  		t.Fatalf("%d: %v", rowNum, err)
  1703  	}
  1704  	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
  1705  	if desc != "NULL" || !strings.HasPrefix(errStr, "error decoding payload") || frac[0] == '-' {
  1706  		t.Fatalf("%d: invalid row", rowNum)
  1707  	}
  1708  	rowNum++
  1709  
  1710  	// Corrupted progress but valid payload.
  1711  	if !rows.Next() {
  1712  		t.Fatalf("%d: too few rows", rowNum)
  1713  	}
  1714  	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
  1715  		t.Fatalf("%d: %v", rowNum, err)
  1716  	}
  1717  	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
  1718  	if desc == "NULL" || !strings.HasPrefix(errStr, "error decoding progress") || frac[0] != '-' {
  1719  		t.Fatalf("%d: invalid row", rowNum)
  1720  	}
  1721  	rowNum++
  1722  
  1723  	// Both payload and progress corrupted.
  1724  	if !rows.Next() {
  1725  		t.Fatalf("%d: too few rows", rowNum)
  1726  	}
  1727  	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
  1728  		t.Fatalf("%d: %v", rowNum, err)
  1729  	}
  1730  	t.Logf("row: %q %q %v", desc, errStr, frac)
  1731  	if desc != "NULL" ||
  1732  		!strings.Contains(errStr, "error decoding payload") ||
  1733  		!strings.Contains(errStr, "error decoding progress") ||
  1734  		frac[0] != '-' {
  1735  		t.Fatalf("%d: invalid row", rowNum)
  1736  	}
  1737  	rowNum++
  1738  
  1739  	// Valid payload and missing progress.
  1740  	if !rows.Next() {
  1741  		t.Fatalf("%d too few rows", rowNum)
  1742  	}
  1743  	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
  1744  		t.Fatalf("%d: %v", rowNum, err)
  1745  	}
  1746  	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
  1747  	if desc == "NULL" || errStr != "" || frac[0] != '-' {
  1748  		t.Fatalf("%d: invalid row", rowNum)
  1749  	}
  1750  	rowNum++
  1751  
  1752  	// Invalid payload and missing progress.
  1753  	if !rows.Next() {
  1754  		t.Fatalf("%d too few rows", rowNum)
  1755  	}
  1756  	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
  1757  		t.Fatalf("%d: %v", rowNum, err)
  1758  	}
  1759  	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
  1760  	if desc != "NULL" ||
  1761  		!strings.Contains(errStr, "error decoding payload") ||
  1762  		strings.Contains(errStr, "error decoding progress") ||
  1763  		frac[0] != '-' {
  1764  		t.Fatalf("%d: invalid row", rowNum)
  1765  	}
  1766  	rowNum++
  1767  }
  1768  
  1769  func TestShowJobWhenComplete(t *testing.T) {
  1770  	defer leaktest.AfterTest(t)()
  1771  	// Canceling a job relies on adopt daemon to move the job to state
  1772  	// reverting.
  1773  	defer func(oldInterval time.Duration) {
  1774  		jobs.DefaultAdoptInterval = oldInterval
  1775  	}(jobs.DefaultAdoptInterval)
  1776  	jobs.DefaultAdoptInterval = 10 * time.Millisecond
  1777  	ctx := context.Background()
  1778  	s, db, _ := serverutils.StartServer(t, base.TestServerArgs{})
  1779  	defer s.Stopper().Stop(ctx)
  1780  	registry := s.JobRegistry().(*jobs.Registry)
  1781  	mockJob := jobs.Record{Details: jobspb.ImportDetails{}, Progress: jobspb.ImportProgress{}}
  1782  	done := make(chan struct{})
  1783  	defer close(done)
  1784  	jobs.RegisterConstructor(
  1785  		jobspb.TypeImport, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer {
  1786  			return jobs.FakeResumer{
  1787  				OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
  1788  					select {
  1789  					case <-ctx.Done():
  1790  						return ctx.Err()
  1791  					case <-done:
  1792  						return nil
  1793  					}
  1794  				},
  1795  			}
  1796  		})
  1797  
  1798  	type row struct {
  1799  		id     int64
  1800  		status string
  1801  	}
  1802  	var out row
  1803  
  1804  	t.Run("show job", func(t *testing.T) {
  1805  		// Start a job and cancel it so it is in state finished and then query it with
  1806  		// SHOW JOB WHEN COMPLETE.
  1807  		job, _, err := registry.CreateAndStartJob(ctx, nil, mockJob)
  1808  		if err != nil {
  1809  			t.Fatal(err)
  1810  		}
  1811  		group := ctxgroup.WithContext(ctx)
  1812  		group.GoCtx(func(ctx context.Context) error {
  1813  			if err := db.QueryRowContext(
  1814  				ctx,
  1815  				`SELECT job_id, status
  1816  				 FROM [SHOW JOB WHEN COMPLETE $1]`,
  1817  				*job.ID()).Scan(&out.id, &out.status); err != nil {
  1818  				return err
  1819  			}
  1820  			if out.status != "canceled" {
  1821  				return errors.Errorf(
  1822  					"Expected status 'canceled' but got '%s'", out.status)
  1823  			}
  1824  			if *job.ID() != out.id {
  1825  				return errors.Errorf(
  1826  					"Expected job id %d but got %d", *job.ID(), out.id)
  1827  			}
  1828  			return nil
  1829  		})
  1830  		// Give a chance for the above group to schedule in order to test that
  1831  		// SHOW JOBS WHEN COMPLETE does block until the job is canceled.
  1832  		time.Sleep(2 * time.Millisecond)
  1833  		if _, err = db.ExecContext(ctx, "CANCEL JOB $1", *job.ID()); err == nil {
  1834  			err = group.Wait()
  1835  		}
  1836  		if err != nil {
  1837  			t.Fatal(err)
  1838  		}
  1839  	})
  1840  	t.Run("show jobs", func(t *testing.T) {
  1841  		// Start two jobs and cancel the first one to make sure the
  1842  		// query still blocks until the second job is also canceled.
  1843  		var jobs [2]*jobs.Job
  1844  		for i := range jobs {
  1845  			job, _, err := registry.CreateAndStartJob(ctx, nil, mockJob)
  1846  			if err != nil {
  1847  				t.Fatal(err)
  1848  			}
  1849  			jobs[i] = job
  1850  		}
  1851  		if _, err := db.ExecContext(ctx, "CANCEL JOB $1", *jobs[0].ID()); err != nil {
  1852  			t.Fatal(err)
  1853  		}
  1854  		group := ctxgroup.WithContext(ctx)
  1855  		group.GoCtx(func(ctx context.Context) error {
  1856  			rows, err := db.QueryContext(ctx,
  1857  				`SELECT job_id, status
  1858  				 FROM [SHOW JOBS WHEN COMPLETE (SELECT $1 UNION SELECT $2)]`,
  1859  				*jobs[0].ID(), *jobs[1].ID())
  1860  			if err != nil {
  1861  				return err
  1862  			}
  1863  			var cnt int
  1864  			for rows.Next() {
  1865  				if err := rows.Scan(&out.id, &out.status); err != nil {
  1866  					return err
  1867  				}
  1868  				cnt += 1
  1869  				switch out.id {
  1870  				case *jobs[0].ID():
  1871  				case *jobs[1].ID():
  1872  					// SHOW JOBS WHEN COMPLETE finishes only after all jobs are
  1873  					// canceled.
  1874  					if out.status != "canceled" {
  1875  						return errors.Errorf(
  1876  							"Expected status 'canceled' but got '%s'",
  1877  							out.status)
  1878  					}
  1879  				default:
  1880  					return errors.Errorf(
  1881  						"Expected either id:%d or id:%d but got: %d",
  1882  						*jobs[0].ID(), *jobs[1].ID(), out.id)
  1883  				}
  1884  			}
  1885  			if cnt != 2 {
  1886  				return errors.Errorf("Expected 2 results but found %d", cnt)
  1887  			}
  1888  			return nil
  1889  		})
  1890  		// Give a chance for the above group to schedule in order to test that
  1891  		// SHOW JOBS WHEN COMPLETE does block until the job is canceled.
  1892  		time.Sleep(2 * time.Millisecond)
  1893  		var err error
  1894  		if _, err = db.ExecContext(ctx, "CANCEL JOB $1", *jobs[1].ID()); err == nil {
  1895  			err = group.Wait()
  1896  		}
  1897  		if err != nil {
  1898  			t.Fatal(err)
  1899  		}
  1900  	})
  1901  }
  1902  
  1903  func TestJobInTxn(t *testing.T) {
  1904  	defer leaktest.AfterTest(t)()
  1905  	defer jobs.ResetConstructors()()
  1906  
  1907  	defer func(oldInterval time.Duration) {
  1908  		jobs.DefaultAdoptInterval = oldInterval
  1909  	}(jobs.DefaultAdoptInterval)
  1910  	jobs.DefaultAdoptInterval = 5 * time.Second
  1911  
  1912  	ctx := context.Background()
  1913  	s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{})
  1914  	defer s.Stopper().Stop(ctx)
  1915  
  1916  	// Accessed atomically.
  1917  	var hasRun int32
  1918  	var job *jobs.Job
  1919  
  1920  	defer sql.ClearPlanHooks()
  1921  	// Piggy back on BACKUP to be able to create a succeeding test job.
  1922  	sql.AddPlanHook(
  1923  		func(_ context.Context, stmt tree.Statement, phs sql.PlanHookState,
  1924  		) (sql.PlanHookRowFn, sqlbase.ResultColumns, []sql.PlanNode, bool, error) {
  1925  			st, ok := stmt.(*tree.Backup)
  1926  			if !ok {
  1927  				return nil, nil, nil, false, nil
  1928  			}
  1929  			fn := func(_ context.Context, _ []sql.PlanNode, _ chan<- tree.Datums) error {
  1930  				var err error
  1931  				job, err = phs.ExtendedEvalContext().QueueJob(
  1932  					jobs.Record{
  1933  						Description: st.String(),
  1934  						Details:     jobspb.BackupDetails{},
  1935  						Progress:    jobspb.BackupProgress{},
  1936  					},
  1937  				)
  1938  				return err
  1939  			}
  1940  			return fn, nil, nil, false, nil
  1941  		},
  1942  	)
  1943  	jobs.RegisterConstructor(jobspb.TypeBackup, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
  1944  		return jobs.FakeResumer{
  1945  			OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
  1946  				t.Logf("Resuming job: %+v", job.Payload())
  1947  				atomic.AddInt32(&hasRun, 1)
  1948  				return nil
  1949  			},
  1950  			FailOrCancel: func(ctx context.Context) error {
  1951  				atomic.AddInt32(&hasRun, 1)
  1952  				return nil
  1953  			},
  1954  		}
  1955  	})
  1956  	// Piggy back on RESTORE to be able to create a failing test job.
  1957  	sql.AddPlanHook(
  1958  		func(_ context.Context, stmt tree.Statement, phs sql.PlanHookState,
  1959  		) (sql.PlanHookRowFn, sqlbase.ResultColumns, []sql.PlanNode, bool, error) {
  1960  			_, ok := stmt.(*tree.Restore)
  1961  			if !ok {
  1962  				return nil, nil, nil, false, nil
  1963  			}
  1964  			fn := func(_ context.Context, _ []sql.PlanNode, _ chan<- tree.Datums) error {
  1965  				var err error
  1966  				job, err = phs.ExtendedEvalContext().QueueJob(
  1967  					jobs.Record{
  1968  						Description: "RESTORE",
  1969  						Details:     jobspb.RestoreDetails{},
  1970  						Progress:    jobspb.RestoreProgress{},
  1971  					},
  1972  				)
  1973  				return err
  1974  			}
  1975  			return fn, nil, nil, false, nil
  1976  		},
  1977  	)
  1978  	jobs.RegisterConstructor(jobspb.TypeRestore, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
  1979  		return jobs.FakeResumer{
  1980  			OnResume: func(_ context.Context, _ chan<- tree.Datums) error {
  1981  				return errors.New("RESTORE failed")
  1982  			},
  1983  		}
  1984  	})
  1985  
  1986  	t.Run("rollback txn", func(t *testing.T) {
  1987  		start := timeutil.Now()
  1988  
  1989  		txn, err := sqlDB.Begin()
  1990  		require.NoError(t, err)
  1991  		_, err = txn.Exec("BACKUP tobeaborted TO doesnotmattter")
  1992  		require.NoError(t, err)
  1993  
  1994  		// If we rollback then the job should not run
  1995  		require.NoError(t, txn.Rollback())
  1996  		registry := s.JobRegistry().(*jobs.Registry)
  1997  		_, err = registry.LoadJob(ctx, *job.ID())
  1998  		require.Error(t, err, "the job should not exist after the txn is rolled back")
  1999  		require.True(t, jobs.HasJobNotFoundError(err))
  2000  
  2001  		sqlRunner := sqlutils.MakeSQLRunner(sqlDB)
  2002  		// Just in case the job was scheduled let's wait for it to finish
  2003  		// to avoid a race.
  2004  		sqlRunner.Exec(t, "SHOW JOB WHEN COMPLETE $1", *job.ID())
  2005  		require.Equal(t, int32(0), atomic.LoadInt32(&hasRun),
  2006  			"job has run in transaction before txn commit")
  2007  
  2008  		require.True(t, timeutil.Since(start) < jobs.DefaultAdoptInterval, "job should have been adopted immediately")
  2009  	})
  2010  
  2011  	t.Run("normal success", func(t *testing.T) {
  2012  		start := timeutil.Now()
  2013  
  2014  		// Now let's actually commit the transaction and check that the job ran.
  2015  		txn, err := sqlDB.Begin()
  2016  		require.NoError(t, err)
  2017  		_, err = txn.Exec("BACKUP tocommit TO foo")
  2018  		require.NoError(t, err)
  2019  		// Committing will block and wait for all jobs to run.
  2020  		require.NoError(t, txn.Commit())
  2021  		registry := s.JobRegistry().(*jobs.Registry)
  2022  		j, err := registry.LoadJob(ctx, *job.ID())
  2023  		require.NoError(t, err, "queued job not found")
  2024  		require.NotEqual(t, int32(0), atomic.LoadInt32(&hasRun),
  2025  			"job scheduled in transaction did not run")
  2026  		require.Equal(t, int32(1), atomic.LoadInt32(&hasRun),
  2027  			"more than one job ran")
  2028  		require.Equal(t, "", j.Payload().Error)
  2029  
  2030  		require.True(t, timeutil.Since(start) < jobs.DefaultAdoptInterval, "job should have been adopted immediately")
  2031  	})
  2032  
  2033  	t.Run("one of the queued jobs fails", func(t *testing.T) {
  2034  		start := timeutil.Now()
  2035  		txn, err := sqlDB.Begin()
  2036  		require.NoError(t, err)
  2037  
  2038  		// Add a succeeding job.
  2039  		_, err = txn.Exec("BACKUP doesnotmatter TO doesnotmattter")
  2040  		require.NoError(t, err)
  2041  		// We hooked up a failing test job to RESTORE.
  2042  		_, err = txn.Exec("RESTORE TABLE tbl FROM somewhere")
  2043  		require.NoError(t, err)
  2044  
  2045  		// Now let's actually commit the transaction and check that there is a
  2046  		// failure.
  2047  		require.Error(t, txn.Commit())
  2048  		require.True(t, timeutil.Since(start) < jobs.DefaultAdoptInterval, "job should have been adopted immediately")
  2049  	})
  2050  }
  2051  
  2052  // TestStartableJobErrors tests that the StartableJob returns the expected
  2053  // errors when used incorrectly and performs the appropriate cleanup in
  2054  // CleanupOnRollback.
  2055  func TestStartableJob(t *testing.T) {
  2056  	defer leaktest.AfterTest(t)()
  2057  	defer jobs.ResetConstructors()()
  2058  
  2059  	ctx := context.Background()
  2060  	s, _, db := serverutils.StartServer(t, base.TestServerArgs{})
  2061  	defer s.Stopper().Stop(ctx)
  2062  	jr := s.JobRegistry().(*jobs.Registry)
  2063  	var resumeFunc atomic.Value
  2064  	resumeFunc.Store(func(ctx context.Context, _ chan<- tree.Datums) error {
  2065  		return nil
  2066  	})
  2067  	setResumeFunc := func(f func(ctx context.Context, _ chan<- tree.Datums) error) (cleanup func()) {
  2068  		prev := resumeFunc.Load()
  2069  		resumeFunc.Store(f)
  2070  		return func() { resumeFunc.Store(prev) }
  2071  	}
  2072  	jobs.RegisterConstructor(jobspb.TypeRestore, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
  2073  		return jobs.FakeResumer{
  2074  			OnResume: func(ctx context.Context, resultsCh chan<- tree.Datums) error {
  2075  				return resumeFunc.Load().(func(ctx context.Context, _ chan<- tree.Datums) error)(ctx, resultsCh)
  2076  			},
  2077  		}
  2078  	})
  2079  	rec := jobs.Record{
  2080  		Description:   "There's a snake in my boot!",
  2081  		Username:      "Woody Pride",
  2082  		DescriptorIDs: []sqlbase.ID{1, 2, 3},
  2083  		Details:       jobspb.RestoreDetails{},
  2084  		Progress:      jobspb.RestoreProgress{},
  2085  	}
  2086  	createStartableJob := func(t *testing.T) (sj *jobs.StartableJob) {
  2087  		require.NoError(t, db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) (err error) {
  2088  			sj, err = jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
  2089  			return err
  2090  		}))
  2091  		return sj
  2092  	}
  2093  	t.Run("Start called more than once", func(t *testing.T) {
  2094  		sj := createStartableJob(t)
  2095  		_, err := sj.Start(ctx)
  2096  		require.NoError(t, err)
  2097  		_, err = sj.Start(ctx)
  2098  		require.Regexp(t, `StartableJob \d+ cannot be started more than once`, err)
  2099  	})
  2100  	t.Run("Start called with active txn", func(t *testing.T) {
  2101  		txn := db.NewTxn(ctx, "test")
  2102  		defer func() {
  2103  			require.NoError(t, txn.Rollback(ctx))
  2104  		}()
  2105  		sj, err := jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
  2106  		require.NoError(t, err)
  2107  		_, err = sj.Start(ctx)
  2108  		require.Regexp(t, `cannot resume .* job which is not committed`, err)
  2109  	})
  2110  	t.Run("Start called with aborted txn", func(t *testing.T) {
  2111  		txn := db.NewTxn(ctx, "test")
  2112  		sj, err := jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
  2113  		require.NoError(t, err)
  2114  		require.NoError(t, txn.Rollback(ctx))
  2115  		_, err = sj.Start(ctx)
  2116  		require.Regexp(t, `cannot resume .* job which is not committed`, err)
  2117  	})
  2118  	t.Run("CleanupOnRollback called with active txn", func(t *testing.T) {
  2119  		txn := db.NewTxn(ctx, "test")
  2120  		defer func() {
  2121  			require.NoError(t, txn.Rollback(ctx))
  2122  		}()
  2123  		sj, err := jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
  2124  		require.NoError(t, err)
  2125  		err = sj.CleanupOnRollback(ctx)
  2126  		require.Regexp(t, `cannot call CleanupOnRollback for a StartableJob with a non-finalized transaction`, err)
  2127  	})
  2128  	t.Run("CleanupOnRollback called with committed txn", func(t *testing.T) {
  2129  		sj := createStartableJob(t)
  2130  		err := sj.CleanupOnRollback(ctx)
  2131  		require.Regexp(t, `cannot call CleanupOnRollback for a StartableJob created by a committed transaction`, err)
  2132  	})
  2133  	t.Run("CleanupOnRollback positive case", func(t *testing.T) {
  2134  		txn := db.NewTxn(ctx, "test")
  2135  		sj, err := jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
  2136  		require.NoError(t, err)
  2137  		require.NoError(t, txn.Rollback(ctx))
  2138  		require.NoError(t, sj.CleanupOnRollback(ctx))
  2139  		for _, id := range jr.CurrentlyRunningJobs() {
  2140  			require.NotEqual(t, id, *sj.ID())
  2141  		}
  2142  	})
  2143  	t.Run("Cancel", func(t *testing.T) {
  2144  		txn := db.NewTxn(ctx, "test")
  2145  		sj, err := jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
  2146  		require.NoError(t, err)
  2147  		require.NoError(t, txn.Commit(ctx))
  2148  		require.NoError(t, sj.Cancel(ctx))
  2149  		status, err := sj.CurrentStatus(ctx)
  2150  		require.NoError(t, err)
  2151  		require.Equal(t, jobs.StatusCancelRequested, status)
  2152  		for _, id := range jr.CurrentlyRunningJobs() {
  2153  			require.NotEqual(t, id, *sj.ID())
  2154  		}
  2155  		_, err = sj.Start(ctx)
  2156  		require.Regexp(t, "job with status cancel-requested cannot be marked started", err)
  2157  	})
  2158  	setUpRunTest := func(t *testing.T) (
  2159  		sj *jobs.StartableJob,
  2160  		resultCh <-chan tree.Datums,
  2161  		blockResume func() (waitForBlocked func() (resultsCh chan<- tree.Datums, unblockWithError func(error))),
  2162  		cleanup func(),
  2163  	) {
  2164  		type blockResp struct {
  2165  			errCh     chan error
  2166  			resultsCh chan<- tree.Datums
  2167  		}
  2168  		blockCh := make(chan chan blockResp, 1)
  2169  		blockResume = func() (waitForBlocked func() (resultsCh chan<- tree.Datums, unblockWithError func(error))) {
  2170  			blockRequest := make(chan blockResp, 1)
  2171  			blockCh <- blockRequest // from test to resumer
  2172  			return func() (resultsCh chan<- tree.Datums, unblockWithError func(error)) {
  2173  				blocked := <-blockRequest // from resumer to test
  2174  				return blocked.resultsCh, func(err error) {
  2175  					blocked.errCh <- err // from test to resumer
  2176  				}
  2177  			}
  2178  		}
  2179  		cleanup = setResumeFunc(func(ctx context.Context, resultsCh chan<- tree.Datums) error {
  2180  			select {
  2181  			case blockRequest := <-blockCh:
  2182  				unblock := make(chan error)
  2183  				blockRequest <- blockResp{
  2184  					errCh:     unblock,
  2185  					resultsCh: resultsCh,
  2186  				}
  2187  				if err := <-unblock; err != nil {
  2188  					return err
  2189  				}
  2190  			default:
  2191  			}
  2192  			return nil
  2193  		})
  2194  		clientResults := make(chan tree.Datums)
  2195  		require.NoError(t, db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) (err error) {
  2196  			sj, err = jr.CreateStartableJobWithTxn(ctx, rec, txn, clientResults)
  2197  			return err
  2198  		}))
  2199  		return sj, clientResults, blockResume, cleanup
  2200  	}
  2201  	t.Run("Run - error during resume", func(t *testing.T) {
  2202  		sj, _, blockResume, cleanup := setUpRunTest(t)
  2203  		defer cleanup()
  2204  		waitForBlocked := blockResume()
  2205  		runErr := make(chan error)
  2206  		go func() { runErr <- sj.Run(ctx) }()
  2207  		_, unblock := waitForBlocked()
  2208  		unblock(errors.New("boom"))
  2209  		require.Regexp(t, "boom", <-runErr)
  2210  	})
  2211  	t.Run("Run - client canceled", func(t *testing.T) {
  2212  		sj, _, blockResume, cleanup := setUpRunTest(t)
  2213  		defer cleanup()
  2214  		ctxToCancel, cancel := context.WithCancel(ctx)
  2215  		runErr := make(chan error)
  2216  		waitForBlocked := blockResume()
  2217  		go func() { runErr <- sj.Run(ctxToCancel) }()
  2218  		resultsCh, unblock := waitForBlocked()
  2219  		cancel()
  2220  		require.Regexp(t, context.Canceled, <-runErr)
  2221  		resultsCh <- tree.Datums{}
  2222  		unblock(nil)
  2223  		testutils.SucceedsSoon(t, func() error {
  2224  			loaded, err := jr.LoadJob(ctx, *sj.ID())
  2225  			require.NoError(t, err)
  2226  			st, err := loaded.CurrentStatus(ctx)
  2227  			require.NoError(t, err)
  2228  			if st != jobs.StatusSucceeded {
  2229  				return errors.Errorf("expected %s, got %s", jobs.StatusSucceeded, st)
  2230  			}
  2231  			return nil
  2232  		})
  2233  	})
  2234  	t.Run("Run - results chan closed", func(t *testing.T) {
  2235  		sj, clientResultsChan, blockResume, cleanup := setUpRunTest(t)
  2236  		defer cleanup()
  2237  		runErr := make(chan error)
  2238  		waitForBlocked := blockResume()
  2239  		go func() { runErr <- sj.Run(ctx) }()
  2240  		resultsCh, unblock := waitForBlocked()
  2241  		go func() {
  2242  			_, ok := <-clientResultsChan
  2243  			assert.True(t, ok)
  2244  			_, ok = <-clientResultsChan
  2245  			assert.False(t, ok)
  2246  			unblock(nil)
  2247  		}()
  2248  		resultsCh <- tree.Datums{}
  2249  		close(resultsCh)
  2250  		require.NoError(t, <-runErr)
  2251  	})
  2252  }
  2253  
  2254  // TestUnmigratedSchemaChangeJobs tests that schema change jobs created in 19.2
  2255  // that have not undergone a migration cannot be adopted, canceled, or paused.
  2256  func TestUnmigratedSchemaChangeJobs(t *testing.T) {
  2257  	defer leaktest.AfterTest(t)()
  2258  	defer jobs.ResetConstructors()()
  2259  	defer func(oldInterval time.Duration) {
  2260  		jobs.DefaultAdoptInterval = oldInterval
  2261  	}(jobs.DefaultAdoptInterval)
  2262  	jobs.DefaultAdoptInterval = 10 * time.Millisecond
  2263  
  2264  	ctx := context.Background()
  2265  
  2266  	s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{})
  2267  	defer s.Stopper().Stop(ctx)
  2268  
  2269  	registry := s.JobRegistry().(*jobs.Registry)
  2270  
  2271  	// The default FormatVersion value in SchemaChangeDetails corresponds to a
  2272  	// pre-20.1 job.
  2273  	rec := jobs.Record{
  2274  		DescriptorIDs: []sqlbase.ID{1},
  2275  		Details:       jobspb.SchemaChangeDetails{},
  2276  		Progress:      jobspb.SchemaChangeProgress{},
  2277  	}
  2278  
  2279  	t.Run("job is not adopted", func(t *testing.T) {
  2280  		resuming := make(chan struct{})
  2281  		jobs.RegisterConstructor(jobspb.TypeSchemaChange, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer {
  2282  			return jobs.FakeResumer{
  2283  				OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
  2284  					resuming <- struct{}{}
  2285  					return nil
  2286  				},
  2287  			}
  2288  		})
  2289  		select {
  2290  		case <-resuming:
  2291  			t.Fatal("job was resumed")
  2292  		case <-time.After(time.Second):
  2293  			// With an adopt interval of 10 ms, within 1 s we can be reasonably sure
  2294  			// that the job was not adopted.
  2295  		}
  2296  	})
  2297  
  2298  	t.Run("pause not supported", func(t *testing.T) {
  2299  		job, err := registry.CreateJobWithTxn(ctx, rec, nil /* txn */)
  2300  		if err != nil {
  2301  			t.Fatal(err)
  2302  		}
  2303  		if _, err := sqlDB.Exec("PAUSE JOB $1", *job.ID()); !testutils.IsError(err, "cannot be paused in this version") {
  2304  			t.Fatal(err)
  2305  		}
  2306  	})
  2307  
  2308  	t.Run("cancel not supported", func(t *testing.T) {
  2309  		job, err := registry.CreateJobWithTxn(ctx, rec, nil /* txn */)
  2310  		if err != nil {
  2311  			t.Fatal(err)
  2312  		}
  2313  		if _, err := sqlDB.Exec("CANCEL JOB $1", *job.ID()); !testutils.IsError(err, "cannot be canceled in this version") {
  2314  			t.Fatal(err)
  2315  		}
  2316  	})
  2317  }