github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/jobs/jobs_test.go (about)

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package jobs_test

import (
	"context"
	gosql "database/sql"
	"fmt"
	"reflect"
	"strings"
	"sync/atomic"
	"testing"
	"time"

	"github.com/cockroachdb/apd"
	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/tests"
	"github.com/cockroachdb/cockroach/pkg/testutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/google/go-cmp/cmp"
	"github.com/kr/pretty"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// expectation defines the information necessary to determine the validity of
// a job in the system.jobs table.
type expectation struct {
	DB                *gosql.DB
	Record            jobs.Record
	Type              jobspb.Type
	Before            time.Time
	FractionCompleted float32
	Error             string
}

func (expected *expectation) verify(id *int64, expectedStatus jobs.Status) error {
	var statusString string
	var created time.Time
	var payloadBytes []byte
	var progressBytes []byte
	if err := expected.DB.QueryRow(
		`SELECT status, created, payload, progress FROM system.jobs WHERE id = $1`, id,
	).Scan(
		&statusString, &created, &payloadBytes, &progressBytes,
	); err != nil {
		return err
	}

	var payload jobspb.Payload
	if err := protoutil.Unmarshal(payloadBytes, &payload); err != nil {
		return err
	}
	var progress jobspb.Progress
	if err := protoutil.Unmarshal(progressBytes, &progress); err != nil {
		return err
	}

	// Verify the upstream-provided fields.
	details := payload.UnwrapDetails()
	progressDetail := progress.UnwrapDetails()

	if e, a := expected.Record, (jobs.Record{
		Description:   payload.Description,
		Details:       details,
		DescriptorIDs: payload.DescriptorIDs,
		Username:      payload.Username,
		Progress:      progressDetail,
	}); !reflect.DeepEqual(e, a) {
		diff := strings.Join(pretty.Diff(e, a), "\n")
		return errors.Errorf("Records do not match:\n%s", diff)
	}

	// Verify internally-managed fields.
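	// Status, type, and fraction completed are maintained by the jobs
	// machinery itself rather than supplied through the Record, so they are
	// compared individually below.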
	status := jobs.Status(statusString)
	if e, a := expectedStatus, status; e != a {
		return errors.Errorf("expected status %v, got %v", e, a)
	}
	if e, a := expected.Type, payload.Type(); e != a {
		return errors.Errorf("expected type %v, got type %v", e, a)
	}
	if e, a := expected.FractionCompleted, progress.GetFractionCompleted(); e != a {
		return errors.Errorf("expected fraction completed %f, got %f", e, a)
	}

	started := timeutil.FromUnixMicros(payload.StartedMicros)
	if started.Equal(timeutil.UnixEpoch) && status == jobs.StatusSucceeded {
		return errors.Errorf("started time is empty but job claims to be successful")
	}
	if status == jobs.StatusRunning || status == jobs.StatusPauseRequested {
		return nil
	}

	if e, a := expected.Error, payload.Error; e != a {
		return errors.Errorf("expected error %q, got %q", e, a)
	}
	return nil
}

func TestJobsTableProgressFamily(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()
	s, db, _ := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop(ctx)

	var table, schema string
	sqlutils.MakeSQLRunner(db).QueryRow(t, `SHOW CREATE system.jobs`).Scan(&table, &schema)
	if !strings.Contains(schema, `FAMILY progress (progress)`) {
		t.Fatalf("expected progress family, got %q", schema)
	}
}

type counters struct {
	ResumeExit, OnFailOrCancelExit int
	// These sometimes retry so just use bool.
	ResumeStart, OnFailOrCancelStart, Success bool
}

type registryTestSuite struct {
	ctx         context.Context
	oldInterval time.Duration
	s           serverutils.TestServerInterface
	outerDB     *gosql.DB
	sqlDB       *sqlutils.SQLRunner
	registry    *jobs.Registry
	done        chan struct{}
	mockJob     jobs.Record
	job         *jobs.Job
	mu          struct {
		syncutil.Mutex
		a counters
		e counters
	}
	resumeCh            chan error
	progressCh          chan struct{}
	failOrCancelCh      chan error
	resumeCheckCh       chan struct{}
	failOrCancelCheckCh chan struct{}
	onPauseRequest      jobs.OnPauseRequestFunc
	// Instead of a ch for success, use a variable because it can retry since it
	// is in a transaction.
	successErr error
}

func noopPauseRequestFunc(
	ctx context.Context, planHookState interface{}, txn *kv.Txn, progress *jobspb.Progress,
) error {
	return nil
}

func (rts *registryTestSuite) setUp(t *testing.T) {
	rts.oldInterval = jobs.DefaultAdoptInterval
	jobs.DefaultAdoptInterval = time.Millisecond
	rts.ctx = context.Background()
	rts.s, rts.outerDB, _ = serverutils.StartServer(t, base.TestServerArgs{})
	rts.sqlDB = sqlutils.MakeSQLRunner(rts.outerDB)
	rts.registry = rts.s.JobRegistry().(*jobs.Registry)
	rts.done = make(chan struct{})
	rts.mockJob = jobs.Record{Details: jobspb.ImportDetails{}, Progress: jobspb.ImportProgress{}}

	rts.resumeCh = make(chan error)
	rts.progressCh = make(chan struct{})
	rts.failOrCancelCh = make(chan error)
	rts.resumeCheckCh = make(chan struct{})
	rts.failOrCancelCheckCh = make(chan struct{})
	rts.onPauseRequest = noopPauseRequestFunc

	jobs.RegisterConstructor(jobspb.TypeImport, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
		return jobs.FakeResumer{
			OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
				t.Log("Starting resume")
				rts.mu.Lock()
				rts.mu.a.ResumeStart = true
				rts.mu.Unlock()
				defer func() {
					rts.mu.Lock()
					rts.mu.a.ResumeExit++
					rts.mu.Unlock()
					t.Log("Exiting resume")
				}()
				for {
					<-rts.resumeCheckCh
					select {
					case <-ctx.Done():
						rts.mu.Lock()
						rts.mu.a.ResumeExit--
						rts.mu.Unlock()
						return ctx.Err()
					case err := <-rts.resumeCh:
						return err
					case <-rts.progressCh:
						err := job.FractionProgressed(rts.ctx, jobs.FractionUpdater(0))
						if err != nil {
							return err
						}
					}
				}
			},
			FailOrCancel: func(ctx context.Context) error {
				t.Log("Starting FailOrCancel")
				rts.mu.Lock()
				rts.mu.a.OnFailOrCancelStart = true
				rts.mu.Unlock()
				defer func() {
					rts.mu.Lock()
					rts.mu.a.OnFailOrCancelExit++
					rts.mu.Unlock()
					t.Log("Exiting OnFailOrCancel")
				}()
				<-rts.failOrCancelCheckCh
				select {
				case <-ctx.Done():
					rts.mu.Lock()
					rts.mu.a.OnFailOrCancelExit--
					rts.mu.Unlock()
					return ctx.Err()
				case err := <-rts.failOrCancelCh:
					return err
				}
			},

			Success: func() error {
				t.Log("Starting success")
				rts.mu.Lock()
				defer func() {
					rts.mu.Unlock()
					t.Log("Exiting success")
				}()
				rts.mu.a.Success = true
				return rts.successErr
			},
			PauseRequest: func(ctx context.Context, execCfg interface{}, txn *kv.Txn, progress *jobspb.Progress) error {
				return rts.onPauseRequest(ctx, execCfg, txn, progress)
			},
		}
	})
}

func (rts *registryTestSuite) tearDown() {
	close(rts.resumeCh)
	close(rts.progressCh)
	close(rts.resumeCheckCh)
	close(rts.failOrCancelCh)
	close(rts.failOrCancelCheckCh)
	close(rts.done)
	rts.s.Stopper().Stop(rts.ctx)
	jobs.DefaultAdoptInterval = rts.oldInterval
	jobs.ResetConstructors()()
}

func (rts *registryTestSuite) check(t *testing.T, expectedStatus jobs.Status) {
	t.Helper()
	opts := retry.Options{
		InitialBackoff: 5 * time.Millisecond,
		MaxBackoff:     time.Second,
		Multiplier:     2,
	}
	if err := retry.WithMaxAttempts(rts.ctx, opts, 10, func() error {
		rts.mu.Lock()
		defer rts.mu.Unlock()
		if diff := cmp.Diff(rts.mu.e, rts.mu.a); diff != "" {
			return errors.Errorf("unexpected diff: %s", diff)
		}
		if expectedStatus == "" {
			return nil
		}
		st, err := rts.job.CurrentStatus(rts.ctx)
		if err != nil {
			return err
		}
		if expectedStatus != st {
			return errors.Errorf("expected job status: %s but got: %s", expectedStatus, st)
		}
		return nil
	}); err != nil {
		t.Fatal(err)
	}
}

func TestRegistryLifecycle(t *testing.T) {
	defer leaktest.AfterTest(t)()
	t.Run("normal success", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()

		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)

		rts.resumeCh <- nil
		rts.mu.e.ResumeExit++
		rts.mu.e.Success = true
		rts.check(t, jobs.StatusSucceeded)
	})

	t.Run("create separately success", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()

		j, err := rts.registry.CreateJobWithTxn(rts.ctx, rts.mockJob, nil)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.check(t, jobs.StatusRunning)

		rts.resumeCheckCh <- struct{}{}
		rts.resumeCh <- nil
		rts.mu.e.ResumeExit++
		rts.mu.e.Success = true
		rts.check(t, jobs.StatusSucceeded)
	})

	t.Run("pause running", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()

		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)

		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
		rts.check(t, jobs.StatusPaused)

		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
		rts.check(t, jobs.StatusPaused)

		rts.sqlDB.Exec(t, "RESUME JOB $1", *j.ID())
		rts.check(t, jobs.StatusRunning)

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)
		rts.resumeCh <- nil
		rts.mu.e.ResumeExit++

		rts.mu.e.Success = true
		rts.check(t, jobs.StatusSucceeded)
	})

	t.Run("pause reverting", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()

		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)

		// Make Resume fail.
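		// Sending an error on resumeCh makes the resumer's OnResume return that
		// error, which is expected to move the job to the reverting state; the
		// steps below then pause and resume the job while it is reverting.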
		rts.resumeCh <- errors.New("resume failed")
		rts.mu.e.ResumeExit++
		rts.mu.e.OnFailOrCancelStart = true
		rts.failOrCancelCheckCh <- struct{}{}
		rts.check(t, jobs.StatusReverting)

		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
		rts.check(t, jobs.StatusPaused)

		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
		rts.check(t, jobs.StatusPaused)

		rts.sqlDB.Exec(t, "RESUME JOB $1", *j.ID())
		rts.failOrCancelCheckCh <- struct{}{}
		rts.check(t, jobs.StatusReverting)

		rts.failOrCancelCh <- nil
		rts.mu.e.OnFailOrCancelExit++
		rts.check(t, jobs.StatusFailed)
	})

	t.Run("cancel running", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()
		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)

		rts.sqlDB.Exec(t, "CANCEL JOB $1", *j.ID())
		rts.mu.e.OnFailOrCancelStart = true
		rts.check(t, jobs.StatusReverting)

		rts.failOrCancelCheckCh <- struct{}{}
		rts.failOrCancelCh <- nil
		rts.mu.e.OnFailOrCancelExit++

		rts.check(t, jobs.StatusCanceled)
	})

	t.Run("cancel reverting", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()
		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)

		rts.sqlDB.Exec(t, "CANCEL JOB $1", *j.ID())
		rts.mu.e.OnFailOrCancelStart = true
		rts.check(t, jobs.StatusReverting)

		rts.sqlDB.ExpectErr(t, "status reverting cannot be requested to be canceled", "CANCEL JOB $1", *j.ID())
		rts.check(t, jobs.StatusReverting)
	})

	t.Run("cancel pause running", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()

		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)

		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
		rts.check(t, jobs.StatusPaused)

		rts.sqlDB.Exec(t, "CANCEL JOB $1", *j.ID())
		rts.mu.e.OnFailOrCancelStart = true
		rts.failOrCancelCheckCh <- struct{}{}
		rts.check(t, jobs.StatusReverting)

		rts.failOrCancelCh <- nil
		rts.mu.e.OnFailOrCancelExit++
		rts.check(t, jobs.StatusCanceled)
	})

	t.Run("cancel pause reverting", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()

		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)

		// Make Resume fail.
		rts.resumeCh <- errors.New("resume failed")
		rts.mu.e.ResumeExit++
		rts.mu.e.OnFailOrCancelStart = true
		rts.failOrCancelCheckCh <- struct{}{}
		rts.check(t, jobs.StatusReverting)

		rts.sqlDB.Exec(t, "PAUSE JOB $1", *j.ID())
		rts.check(t, jobs.StatusPaused)

		rts.sqlDB.ExpectErr(t, "paused and has non-nil FinalResumeError resume", "CANCEL JOB $1", *j.ID())
		rts.check(t, jobs.StatusPaused)

		rts.sqlDB.Exec(t, "RESUME JOB $1", *j.ID())
		rts.failOrCancelCheckCh <- struct{}{}
		rts.check(t, jobs.StatusReverting)

		rts.failOrCancelCh <- nil
		rts.mu.e.OnFailOrCancelExit++
		rts.check(t, jobs.StatusFailed)
	})

	// Verify that pause and cancel in a rollback do nothing.
	t.Run("rollback", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()
		job, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = job

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)

		// Rollback a CANCEL.
		{
			txn, err := rts.outerDB.Begin()
			if err != nil {
				t.Fatal(err)
			}
			// OnFailOrCancel is *not* called in the same txn in which the job is
			// marked cancel-requested; it will only be called when the job is
			// adopted again.
			if _, err := txn.Exec("CANCEL JOB $1", *job.ID()); err != nil {
				t.Fatal(err)
			}
			if err := txn.Rollback(); err != nil {
				t.Fatal(err)
			}
			rts.progressCh <- struct{}{}
			rts.resumeCheckCh <- struct{}{}
			rts.check(t, jobs.StatusRunning)
		}
		// Rollback a PAUSE.
		{
			txn, err := rts.outerDB.Begin()
			if err != nil {
				t.Fatal(err)
			}
			if _, err := txn.Exec("PAUSE JOB $1", *job.ID()); err != nil {
				t.Fatal(err)
			}
			if err := txn.Rollback(); err != nil {
				t.Fatal(err)
			}
			rts.progressCh <- struct{}{}
			rts.resumeCheckCh <- struct{}{}
			rts.check(t, jobs.StatusRunning)
		}
		// Now pause it for reals.
		{
			txn, err := rts.outerDB.Begin()
			if err != nil {
				t.Fatal(err)
			}
			if _, err := txn.Exec("PAUSE JOB $1", *job.ID()); err != nil {
				t.Fatal(err)
			}
			// Not committed yet, so state shouldn't have changed.
			// Don't check status in txn.
			rts.check(t, "")
			if err := txn.Commit(); err != nil {
				t.Fatal(err)
			}
			rts.check(t, jobs.StatusPaused)
		}
		// Rollback a RESUME.
		{
			txn, err := rts.outerDB.Begin()
			if err != nil {
				t.Fatal(err)
			}
			if _, err := txn.Exec("RESUME JOB $1", *job.ID()); err != nil {
				t.Fatal(err)
			}
			if err := txn.Rollback(); err != nil {
				t.Fatal(err)
			}
			rts.check(t, jobs.StatusPaused)
		}
		// Commit a RESUME.
		{
			txn, err := rts.outerDB.Begin()
			if err != nil {
				t.Fatal(err)
			}
			if _, err := txn.Exec("RESUME JOB $1", *job.ID()); err != nil {
				t.Fatal(err)
			}
			// Not committed yet, so state shouldn't have changed.
			// Don't check status in txn.
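			// An empty status makes check skip the job-status lookup and compare
			// only the expected and actual counters (see registryTestSuite.check).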
			rts.check(t, "")
			if err := txn.Commit(); err != nil {
				t.Fatal(err)
			}
		}
		rts.mu.e.ResumeStart = true
		rts.check(t, jobs.StatusRunning)
		rts.resumeCheckCh <- struct{}{}
		rts.resumeCh <- nil
		rts.mu.e.ResumeExit++
		rts.mu.e.Success = true
		rts.check(t, jobs.StatusSucceeded)
	})

	t.Run("failed running", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()

		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.check(t, jobs.StatusRunning)

		rts.resumeCheckCh <- struct{}{}
		rts.resumeCh <- errors.New("resume failed")
		rts.mu.e.ResumeExit++
		rts.mu.e.OnFailOrCancelStart = true
		rts.failOrCancelCheckCh <- struct{}{}
		rts.check(t, jobs.StatusReverting)

		rts.failOrCancelCh <- nil
		rts.mu.e.OnFailOrCancelExit++
		rts.check(t, jobs.StatusFailed)
	})

	// Attempt to mark the job as successful, but fail; then fail OnFailOrCancel
	// as well.
	t.Run("fail marking success and fail OnFailOrCancel", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()

		// Make marking success fail.
		rts.successErr = errors.New("resume failed")
		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)

		rts.resumeCh <- nil
		rts.mu.e.ResumeExit++
		rts.mu.e.Success = true
		rts.mu.e.OnFailOrCancelStart = true

		// The job is now in state reverting and will never resume again because
		// OnFailOrCancel also fails.
		rts.check(t, jobs.StatusReverting)
		rts.failOrCancelCheckCh <- struct{}{}
		rts.mu.e.OnFailOrCancelExit++
		rts.failOrCancelCh <- errors.New("reverting failed")
		rts.check(t, jobs.StatusFailed)
	})

	// Fail the job, but also fail to mark it failed.
	t.Run("fail marking failed", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()

		// Make marking success fail.
		rts.successErr = errors.New("resume failed")
		j, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = j

		rts.mu.e.ResumeStart = true
		rts.resumeCheckCh <- struct{}{}
		rts.check(t, jobs.StatusRunning)

		rts.resumeCh <- errors.New("resume failed")
		rts.mu.e.ResumeExit++
		rts.mu.e.OnFailOrCancelStart = true
		rts.failOrCancelCheckCh <- struct{}{}
		// The job is now in state reverting and will never resume again.
		rts.check(t, jobs.StatusReverting)

		// But let it fail.
		rts.mu.e.OnFailOrCancelExit++
		rts.failOrCancelCh <- errors.New("resume failed")
		rts.check(t, jobs.StatusFailed)
	})

	t.Run("fail 2.0 jobs with no progress", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()
		job, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		if err != nil {
			t.Fatal(err)
		}
		rts.job = job
		rts.mu.e.ResumeStart = true
		rts.check(t, jobs.StatusRunning)
		rts.sqlDB.Exec(t, "PAUSE JOB $1", *job.ID())
		rts.sqlDB.Exec(t, `UPDATE system.jobs SET progress = NULL, status = $2 WHERE id = $1`, *job.ID(), jobs.StatusRunning)
		testutils.SucceedsSoon(t, func() error {
			var status jobs.Status
			var err string
			rts.sqlDB.QueryRow(t, `SELECT error, status FROM [SHOW JOBS] WHERE job_id = $1`, *job.ID()).Scan(&err, &status)
			if status != jobs.StatusFailed {
				return errors.Errorf("unexpected status: %s", status)
			}
			if err != "job predates cluster upgrade and must be re-run" {
				return errors.Errorf("unexpected error: %s", err)
			}
			return nil
		})
		rts.check(t, jobs.StatusFailed)
	})

	t.Run("OnPauseRequest", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()
		madeUpSpans := []roachpb.Span{
			{Key: roachpb.Key("foo")},
		}
		rts.onPauseRequest = func(ctx context.Context, planHookState interface{}, txn *kv.Txn, progress *jobspb.Progress) error {
			progress.GetImport().SpanProgress = madeUpSpans
			return nil
		}

		job, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		require.NoError(t, err)
		rts.job = job

		rts.resumeCheckCh <- struct{}{}
		rts.mu.e.ResumeStart = true
		rts.check(t, jobs.StatusRunning)

		// Request that the job is paused.
		pauseErrCh := make(chan error)
		go func() {
			_, err := rts.outerDB.Exec("PAUSE JOB $1", *job.ID())
			pauseErrCh <- err
		}()

		// Ensure that the pause went off without a problem.
		require.NoError(t, <-pauseErrCh)
		rts.check(t, jobs.StatusPaused)
		{
			// Make sure the side-effects of our pause function occurred.
			j, err := rts.registry.LoadJob(rts.ctx, *job.ID())
			require.NoError(t, err)
			progress := j.Progress()
			require.Equal(t, madeUpSpans, progress.GetImport().SpanProgress)
		}
	})
	t.Run("OnPauseRequest failure does not pause", func(t *testing.T) {
		rts := registryTestSuite{}
		rts.setUp(t)
		defer rts.tearDown()

		rts.onPauseRequest = func(ctx context.Context, planHookState interface{}, txn *kv.Txn, progress *jobspb.Progress) error {
			return errors.New("boom")
		}

		job, _, err := rts.registry.CreateAndStartJob(rts.ctx, nil, rts.mockJob)
		require.NoError(t, err)
		rts.job = job

		// Allow the job to start.
		rts.resumeCheckCh <- struct{}{}
		rts.mu.e.ResumeStart = true
		rts.check(t, jobs.StatusRunning)

		// Request that the job is paused and ensure that the pause hit the error
		// and failed to pause.
		_, err = rts.outerDB.Exec("PAUSE JOB $1", *job.ID())
		require.Regexp(t, "boom", err)

		// Allow the job to complete.
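		// A nil error on resumeCh lets OnResume return successfully, so the job
		// should still finish normally even though the pause request above failed.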
		rts.resumeCh <- nil
		rts.mu.e.Success = true
		rts.mu.e.ResumeExit++
		rts.check(t, jobs.StatusSucceeded)
	})
}

func TestJobLifecycle(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer jobs.ResetConstructors()()

	ctx := context.Background()

	s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop(ctx)

	registry := s.JobRegistry().(*jobs.Registry)

	createJob := func(record jobs.Record) (*jobs.Job, expectation) {
		beforeTime := timeutil.Now()
		job := registry.NewJob(record)
		if err := job.Created(ctx); err != nil {
			t.Fatal(err)
		}
		payload := job.Payload()
		return job, expectation{
			DB:     sqlDB,
			Record: record,
			Type:   payload.Type(),
			Before: beforeTime,
		}
	}

	defaultRecord := jobs.Record{
		// Job does not accept an empty Details field, so arbitrarily provide
		// ImportDetails.
		Details:  jobspb.ImportDetails{},
		Progress: jobspb.ImportProgress{},
	}

	createDefaultJob := func() (*jobs.Job, expectation) {
		return createJob(defaultRecord)
	}

	done := make(chan struct{})
	defer close(done)

	jobs.RegisterConstructor(jobspb.TypeImport, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer {
		return jobs.FakeResumer{
			OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
				select {
				case <-ctx.Done():
					return ctx.Err()
				case <-done:
					return nil
				}
			},
		}
	})

	startLeasedJob := func(t *testing.T, record jobs.Record) (*jobs.Job, expectation) {
		beforeTime := timeutil.Now()
		job, _, err := registry.CreateAndStartJob(ctx, nil, record)
		if err != nil {
			t.Fatal(err)
		}
		payload := job.Payload()
		return job, expectation{
			DB:     sqlDB,
			Record: record,
			Type:   payload.Type(),
			Before: beforeTime,
		}
	}

	t.Run("valid job lifecycles succeed", func(t *testing.T) {
		// Woody is a successful job.
		woodyJob, woodyExp := createJob(jobs.Record{
			Description:   "There's a snake in my boot!",
			Username:      "Woody Pride",
			DescriptorIDs: []sqlbase.ID{1, 2, 3},
			Details:       jobspb.RestoreDetails{},
			Progress:      jobspb.RestoreProgress{},
		})

		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}

		if err := woodyJob.Started(ctx); err != nil {
			t.Fatal(err)
		}
		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}

		// This fraction-completed progression tests that calling Progressed with a
		// fractionCompleted lower than the last-recorded value is still observed.
		progresses := []struct {
			actual   float32
			expected float32
		}{
			{0.0, 0.0}, {0.5, 0.5}, {0.5, 0.5}, {0.4, 0.4}, {0.8, 0.8}, {1.0, 1.0},
		}
		for _, f := range progresses {
			if err := woodyJob.FractionProgressed(ctx, jobs.FractionUpdater(f.actual)); err != nil {
				t.Fatal(err)
			}
			woodyExp.FractionCompleted = f.expected
			if err := woodyExp.verify(woodyJob.ID(), jobs.StatusRunning); err != nil {
				t.Fatal(err)
			}
		}

		// Test Progressed callbacks.
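		// The updater passed to FractionProgressed receives the progress details
		// and may mutate them in place in addition to returning the new fraction;
		// both changes should be persisted and visible via verify below.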
		if err := woodyJob.FractionProgressed(ctx, func(_ context.Context, details jobspb.ProgressDetails) float32 {
			details.(*jobspb.Progress_Restore).Restore.HighWater = roachpb.Key("mariana")
			return 1.0
		}); err != nil {
			t.Fatal(err)
		}
		woodyExp.Record.Progress = jobspb.RestoreProgress{HighWater: roachpb.Key("mariana")}
		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}

		if err := woodyJob.Succeeded(ctx); err != nil {
			t.Fatal(err)
		}
		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusSucceeded); err != nil {
			t.Fatal(err)
		}

		// Buzz fails after it starts running.
		buzzRecord := jobs.Record{
			Description:   "To infinity and beyond!",
			Username:      "Buzz Lightyear",
			DescriptorIDs: []sqlbase.ID{3, 2, 1},
			Details:       jobspb.BackupDetails{},
			Progress:      jobspb.BackupProgress{},
		}
		buzzExp := expectation{
			DB:     sqlDB,
			Record: buzzRecord,
			Type:   jobspb.TypeBackup,
			Before: timeutil.Now(),
			Error:  "Buzz Lightyear can't fly",
		}
		buzzJob := registry.NewJob(buzzRecord)

		if err := buzzJob.Created(ctx); err != nil {
			t.Fatal(err)
		}
		if err := buzzExp.verify(buzzJob.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}

		if err := buzzJob.Started(ctx); err != nil {
			t.Fatal(err)
		}
		if err := buzzExp.verify(buzzJob.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}

		if err := buzzJob.FractionProgressed(ctx, jobs.FractionUpdater(.42)); err != nil {
			t.Fatal(err)
		}
		buzzExp.FractionCompleted = .42
		if err := buzzExp.verify(buzzJob.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}

		if err := buzzJob.Failed(ctx, errors.New("Buzz Lightyear can't fly")); err != nil {
			t.Fatal(err)
		}
		if err := buzzExp.verify(buzzJob.ID(), jobs.StatusFailed); err != nil {
			t.Fatal(err)
		}

		// Ensure that logging Buzz didn't corrupt Woody.
		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusSucceeded); err != nil {
			t.Fatal(err)
		}

		// Sid fails before it starts running.
		sidJob, sidExp := createJob(jobs.Record{
			Description:   "The toys! The toys are alive!",
			Username:      "Sid Phillips",
			DescriptorIDs: []sqlbase.ID{6, 6, 6},
			Details:       jobspb.RestoreDetails{},
			Progress:      jobspb.RestoreProgress{},
		})

		if err := sidExp.verify(sidJob.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}

		if err := sidJob.Failed(ctx, errors.New("Sid is a total failure")); err != nil {
			t.Fatal(err)
		}
		sidExp.Error = "Sid is a total failure"
		if err := sidExp.verify(sidJob.ID(), jobs.StatusFailed); err != nil {
			t.Fatal(err)
		}

		// Ensure that logging Sid didn't corrupt Woody or Buzz.
		if err := woodyExp.verify(woodyJob.ID(), jobs.StatusSucceeded); err != nil {
			t.Fatal(err)
		}
		if err := buzzExp.verify(buzzJob.ID(), jobs.StatusFailed); err != nil {
			t.Fatal(err)
		}
	})

	t.Run("FinishedWith", func(t *testing.T) {
		t.Run("nil error marks job as successful", func(t *testing.T) {
			job, exp := createDefaultJob()
			exp.FractionCompleted = 1.0
			if err := job.Started(ctx); err != nil {
				t.Fatal(err)
			}
			if err := job.Succeeded(ctx); err != nil {
				t.Fatal(err)
			}
			if err := exp.verify(job.ID(), jobs.StatusSucceeded); err != nil {
				t.Fatal(err)
			}
		})

		t.Run("non-nil error marks job as failed", func(t *testing.T) {
			job, exp := createDefaultJob()
			boom := errors.New("boom")
			exp.Error = boom.Error()
			if err := job.Failed(ctx, boom); err != nil {
				t.Fatal(err)
			}
			if err := exp.verify(job.ID(), jobs.StatusFailed); err != nil {
				t.Fatal(err)
			}
		})

		t.Run("internal errors are not swallowed if marking job as successful", func(t *testing.T) {
			job, _ := createDefaultJob()
			if _, err := sqlDB.Exec(
				`UPDATE system.jobs SET payload = 'garbage' WHERE id = $1`, *job.ID(),
			); err != nil {
				t.Fatal(err)
			}
			if err := job.Succeeded(ctx); !testutils.IsError(err, "wrong wireType") {
				t.Fatalf("unexpected: %v", err)
			}
		})

		t.Run("internal errors are not swallowed if marking job as failed", func(t *testing.T) {
			job, _ := createDefaultJob()
			if _, err := sqlDB.Exec(
				`UPDATE system.jobs SET payload = 'garbage' WHERE id = $1`, *job.ID(),
			); err != nil {
				t.Fatal(err)
			}
			if err := job.Failed(ctx, errors.New("boom")); !testutils.IsError(err, "wrong wireType") {
				t.Fatalf("unexpected: %v", err)
			}
		})
	})

	t.Run("cancelable jobs can be paused until finished", func(t *testing.T) {
		job, exp := startLeasedJob(t, defaultRecord)

		if err := registry.PauseRequested(ctx, nil, *job.ID()); err != nil {
			t.Fatal(err)
		}
		if err := job.Paused(ctx); err != nil {
			t.Fatal(err)
		}
		if err := exp.verify(job.ID(), jobs.StatusPaused); err != nil {
			t.Fatal(err)
		}
		if err := registry.Resume(ctx, nil, *job.ID()); err != nil {
			t.Fatal(err)
		}
		// Resume the job again to ensure that the resumption is idempotent.
		if err := registry.Resume(ctx, nil, *job.ID()); err != nil {
			t.Fatal(err)
		}
		if err := exp.verify(job.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}

		// PauseRequested fails after job is successful.
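		// Succeeded is a terminal state, so the pause request below should be
		// rejected.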
		if err := job.Succeeded(ctx); err != nil {
			t.Fatal(err)
		}
		if err := registry.PauseRequested(ctx, nil, *job.ID()); !testutils.IsError(err, "cannot be requested to be paused") {
			t.Fatalf("expected 'cannot pause succeeded job', but got '%s'", err)
		}
	})

	t.Run("cancelable jobs can be canceled until finished", func(t *testing.T) {
		{
			job, exp := startLeasedJob(t, defaultRecord)
			if err := registry.CancelRequested(ctx, nil, *job.ID()); err != nil {
				t.Fatal(err)
			}
			if err := exp.verify(job.ID(), jobs.StatusCancelRequested); err != nil {
				t.Fatal(err)
			}
		}

		{
			job, exp := startLeasedJob(t, defaultRecord)
			if err := job.Started(ctx); err != nil {
				t.Fatal(err)
			}
			if err := registry.CancelRequested(ctx, nil, *job.ID()); err != nil {
				t.Fatal(err)
			}
			if err := exp.verify(job.ID(), jobs.StatusCancelRequested); err != nil {
				t.Fatal(err)
			}
		}

		{
			job, exp := startLeasedJob(t, defaultRecord)
			if err := registry.PauseRequested(ctx, nil, *job.ID()); err != nil {
				t.Fatal(err)
			}
			if err := job.Paused(ctx); err != nil {
				t.Fatal(err)
			}
			if err := registry.CancelRequested(ctx, nil, *job.ID()); err != nil {
				t.Fatal(err)
			}
			if err := exp.verify(job.ID(), jobs.StatusCancelRequested); err != nil {
				t.Fatal(err)
			}
		}

		{
			job, _ := startLeasedJob(t, defaultRecord)
			if err := job.Succeeded(ctx); err != nil {
				t.Fatal(err)
			}
			expectedErr := "job with status succeeded cannot be requested to be canceled"
			if err := registry.CancelRequested(ctx, nil, *job.ID()); !testutils.IsError(err, expectedErr) {
				t.Fatalf("expected '%s', but got '%s'", expectedErr, err)
			}
		}
	})

	t.Run("unpaused jobs cannot be resumed", func(t *testing.T) {
		{
			job, _ := startLeasedJob(t, defaultRecord)
			if err := registry.CancelRequested(ctx, nil, *job.ID()); err != nil {
				t.Fatal(err)
			}
			if err := registry.Resume(ctx, nil, *job.ID()); !testutils.IsError(err, "cannot be resumed") {
				t.Errorf("got unexpected status '%v'", err)
			}
		}

		{
			job, _ := startLeasedJob(t, defaultRecord)
			if err := job.Succeeded(ctx); err != nil {
				t.Fatal(err)
			}
			expectedErr := fmt.Sprintf("job with status %s cannot be resumed", jobs.StatusSucceeded)
			if err := registry.Resume(ctx, nil, *job.ID()); !testutils.IsError(err, expectedErr) {
				t.Errorf("expected '%s', but got '%v'", expectedErr, err)
			}
		}
	})

	t.Run("bad job details fail", func(t *testing.T) {
		defer func() {
			if r, ok := recover().(string); !ok || !strings.Contains(r, "unknown details type int") {
				t.Fatalf("expected 'unknown details type int', but got: %v", r)
			}
		}()

		job := registry.NewJob(jobs.Record{
			Details: 42,
		})
		_ = job.Created(ctx)
	})

	t.Run("update before create fails", func(t *testing.T) {
		job := registry.NewJob(jobs.Record{
			Details:  jobspb.RestoreDetails{},
			Progress: jobspb.RestoreProgress{},
		})
		if err := job.Started(ctx); !testutils.IsError(err, "job not created") {
			t.Fatalf("expected 'job not created' error, but got %v", err)
		}
	})

	t.Run("same state transition twice succeeds silently", func(t *testing.T) {
		job, _ := createDefaultJob()
		if err := job.Started(ctx); err != nil {
			t.Fatal(err)
		}
		if err := job.Started(ctx); err != nil {
			t.Fatal(err)
		}
		if err := job.Succeeded(ctx); err != nil {
			t.Fatal(err)
		}
		if err := job.Succeeded(ctx); err != nil {
			t.Fatal(err)
		}
	})

	t.Run("high-water progress works", func(t *testing.T) {
		job, _ := createDefaultJob()
		if err := job.Started(ctx); err != nil {
			t.Fatal(err)
		}
		highWaters := []hlc.Timestamp{
			{WallTime: 1, Logical: 1},
			{WallTime: 2, Logical: 0},
		}
		for _, ts := range highWaters {
			if err := job.HighWaterProgressed(
				ctx, func(context.Context, *kv.Txn, jobspb.ProgressDetails) (hlc.Timestamp, error) { return ts, nil },
			); err != nil {
				t.Fatal(err)
			}
			p := job.Progress()
			if actual := *p.GetHighWater(); actual != ts {
				t.Fatalf(`got %s expected %s`, actual, ts)
			}
		}
	})

	t.Run("out of bounds progress fails", func(t *testing.T) {
		job, _ := createDefaultJob()
		if err := job.Started(ctx); err != nil {
			t.Fatal(err)
		}
		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(-0.1)); !testutils.IsError(err, "outside allowable range") {
			t.Fatalf("expected 'outside allowable range' error, but got %v", err)
		}
		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(1.1)); !testutils.IsError(err, "outside allowable range") {
			t.Fatalf("expected 'outside allowable range' error, but got %v", err)
		}
		if err := job.HighWaterProgressed(
			ctx, func(context.Context, *kv.Txn, jobspb.ProgressDetails) (hlc.Timestamp, error) {
				return hlc.Timestamp{WallTime: -1}, nil
			},
		); !testutils.IsError(err, "outside allowable range") {
			t.Fatalf("expected 'outside allowable range' error, but got %v", err)
		}
	})

	t.Run("error propagates", func(t *testing.T) {
		job, _ := createDefaultJob()
		if err := job.Started(ctx); err != nil {
			t.Fatal(err)
		}
		if err := job.HighWaterProgressed(
			ctx, func(context.Context, *kv.Txn, jobspb.ProgressDetails) (hlc.Timestamp, error) {
				return hlc.Timestamp{WallTime: 2}, errors.Errorf("boom")
			},
		); !testutils.IsError(err, "boom") {
			t.Fatalf("expected 'boom' error, but got %v", err)
		}
	})

	t.Run("progress on finished job fails", func(t *testing.T) {
		job, _ := createDefaultJob()
		if err := job.Started(ctx); err != nil {
			t.Fatal(err)
		}
		if err := job.Succeeded(ctx); err != nil {
			t.Fatal(err)
		}
		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(0.5)); !testutils.IsError(
			err, `cannot update progress on succeeded job \(id \d+\)`,
		) {
			t.Fatalf("expected 'cannot update progress' error, but got %v", err)
		}
	})

	t.Run("progress on paused job fails", func(t *testing.T) {
		job, _ := startLeasedJob(t, defaultRecord)
		if err := registry.PauseRequested(ctx, nil, *job.ID()); err != nil {
			t.Fatal(err)
		}
		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(0.5)); !testutils.IsError(
			err, `cannot update progress on pause-requested job`,
		) {
			t.Fatalf("expected progress error, but got %v", err)
		}
	})

	t.Run("progress on canceled job fails", func(t *testing.T) {
		job, _ := startLeasedJob(t, defaultRecord)
		if err := registry.CancelRequested(ctx, nil, *job.ID()); err != nil {
			t.Fatal(err)
		}
		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(0.5)); !testutils.IsError(
			err, `cannot update progress on cancel-requested job \(id \d+\)`,
		) {
			t.Fatalf("expected progress error, but got %v", err)
		}
	})

	t.Run("succeeded forces fraction completed to 1.0", func(t *testing.T) {
		job, exp := createDefaultJob()
		if err := job.Started(ctx); err != nil {
			t.Fatal(err)
		}
		if err := job.FractionProgressed(ctx, jobs.FractionUpdater(0.2)); err != nil {
			t.Fatal(err)
		}
		if err := job.Succeeded(ctx); err != nil {
			t.Fatal(err)
		}
		exp.FractionCompleted = 1.0
		if err := exp.verify(job.ID(), jobs.StatusSucceeded); err != nil {
			t.Fatal(err)
		}
	})

	t.Run("set details works", func(t *testing.T) {
		job, exp := createJob(jobs.Record{
			Details:  jobspb.RestoreDetails{},
			Progress: jobspb.RestoreProgress{},
		})
		if err := exp.verify(job.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}
		newDetails := jobspb.RestoreDetails{URIs: []string{"new"}}
		exp.Record.Details = newDetails
		if err := job.SetDetails(ctx, newDetails); err != nil {
			t.Fatal(err)
		}
		if err := exp.verify(job.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}
	})

	t.Run("set progress works", func(t *testing.T) {
		job, exp := createJob(jobs.Record{
			Details:  jobspb.RestoreDetails{},
			Progress: jobspb.RestoreProgress{},
		})
		if err := exp.verify(job.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}
		newDetails := jobspb.RestoreProgress{HighWater: []byte{42}}
		exp.Record.Progress = newDetails
		if err := job.SetProgress(ctx, newDetails); err != nil {
			t.Fatal(err)
		}
		if err := exp.verify(job.ID(), jobs.StatusRunning); err != nil {
			t.Fatal(err)
		}
	})

}

// TestShowJobs manually inserts a row into system.jobs and checks that the
// encoded protobuf payload is properly decoded and visible in
// crdb_internal.jobs.
func TestShowJobs(t *testing.T) {
	defer leaktest.AfterTest(t)()

	params, _ := tests.CreateTestServerParams()
	s, rawSQLDB, _ := serverutils.StartServer(t, params)
	sqlDB := sqlutils.MakeSQLRunner(rawSQLDB)
	defer s.Stopper().Stop(context.Background())

	// row represents a row returned from crdb_internal.jobs, but
	// *not* a row in system.jobs.
	type row struct {
		id                int64
		typ               string
		status            string
		description       string
		username          string
		err               string
		created           time.Time
		started           time.Time
		finished          time.Time
		modified          time.Time
		fractionCompleted float32
		highWater         hlc.Timestamp
		coordinatorID     roachpb.NodeID
		details           jobspb.Details
	}

	for _, in := range []row{
		{
			id:          42,
			typ:         "SCHEMA CHANGE",
			status:      "superfailed",
			description: "failjob",
			username:    "failure",
			err:         "boom",
			// lib/pq returns time.Time objects with goofy locations, which breaks
			// reflect.DeepEqual without this time.FixedZone song and dance.
			// See: https://github.com/lib/pq/issues/329
			created:           timeutil.Unix(1, 0).In(time.FixedZone("", 0)),
			started:           timeutil.Unix(2, 0).In(time.FixedZone("", 0)),
			finished:          timeutil.Unix(3, 0).In(time.FixedZone("", 0)),
			modified:          timeutil.Unix(4, 0).In(time.FixedZone("", 0)),
			fractionCompleted: 0.42,
			coordinatorID:     7,
			details:           jobspb.SchemaChangeDetails{},
		},
		{
			id:          43,
			typ:         "CHANGEFEED",
			status:      "running",
			description: "persistent feed",
			username:    "persistent",
			err:         "",
			// lib/pq returns time.Time objects with goofy locations, which breaks
			// reflect.DeepEqual without this time.FixedZone song and dance.
			// See: https://github.com/lib/pq/issues/329
			created:  timeutil.Unix(1, 0).In(time.FixedZone("", 0)),
			started:  timeutil.Unix(2, 0).In(time.FixedZone("", 0)),
			finished: timeutil.Unix(3, 0).In(time.FixedZone("", 0)),
			modified: timeutil.Unix(4, 0).In(time.FixedZone("", 0)),
			highWater: hlc.Timestamp{
				WallTime: 1533143242000000,
				Logical:  4,
			},
			coordinatorID: 7,
			details:       jobspb.ChangefeedDetails{},
		},
	} {
		t.Run("", func(t *testing.T) {
			// system.jobs is part proper SQL columns, part protobuf, so we can't use the
			// row struct directly.
			inPayload, err := protoutil.Marshal(&jobspb.Payload{
				Description:    in.description,
				StartedMicros:  in.started.UnixNano() / time.Microsecond.Nanoseconds(),
				FinishedMicros: in.finished.UnixNano() / time.Microsecond.Nanoseconds(),
				Username:       in.username,
				Lease: &jobspb.Lease{
					NodeID: 7,
				},
				Error:   in.err,
				Details: jobspb.WrapPayloadDetails(in.details),
			})
			if err != nil {
				t.Fatal(err)
			}

			progress := &jobspb.Progress{
				ModifiedMicros: in.modified.UnixNano() / time.Microsecond.Nanoseconds(),
			}
			if in.highWater != (hlc.Timestamp{}) {
				progress.Progress = &jobspb.Progress_HighWater{
					HighWater: &in.highWater,
				}
			} else {
				progress.Progress = &jobspb.Progress_FractionCompleted{
					FractionCompleted: in.fractionCompleted,
				}
			}
			inProgress, err := protoutil.Marshal(progress)
			if err != nil {
				t.Fatal(err)
			}
			sqlDB.Exec(t,
				`INSERT INTO system.jobs (id, status, created, payload, progress) VALUES ($1, $2, $3, $4, $5)`,
				in.id, in.status, in.created, inPayload, inProgress,
			)

			var out row
			var maybeFractionCompleted *float32
			var decimalHighWater *apd.Decimal
			sqlDB.QueryRow(t, `
				SELECT job_id, job_type, status, created, description, started, finished, modified,
				       fraction_completed, high_water_timestamp, user_name, ifnull(error, ''), coordinator_id
				FROM crdb_internal.jobs WHERE job_id = $1`, in.id).Scan(
				&out.id, &out.typ, &out.status, &out.created, &out.description, &out.started,
				&out.finished, &out.modified, &maybeFractionCompleted, &decimalHighWater, &out.username,
				&out.err, &out.coordinatorID,
			)

			if decimalHighWater != nil {
				var err error
				out.highWater, err = tree.DecimalToHLC(decimalHighWater)
				if err != nil {
					t.Fatal(err)
				}
			}

			if maybeFractionCompleted != nil {
				out.fractionCompleted = *maybeFractionCompleted
			}

			// details field is not explicitly checked for equality; its value is
			// confirmed via the job_type field, which is dependent on the details
			// field.
			out.details = in.details

			if !reflect.DeepEqual(in, out) {
				diff := strings.Join(pretty.Diff(in, out), "\n")
				t.Fatalf("in job did not match out job:\n%s", diff)
			}
		})
	}
}

func TestShowAutomaticJobs(t *testing.T) {
	defer leaktest.AfterTest(t)()

	params, _ := tests.CreateTestServerParams()
	s, rawSQLDB, _ := serverutils.StartServer(t, params)
	sqlDB := sqlutils.MakeSQLRunner(rawSQLDB)
	defer s.Stopper().Stop(context.Background())

	// row represents a row returned from crdb_internal.jobs, but
	// *not* a row in system.jobs.
	type row struct {
		id      int64
		typ     string
		status  string
		details jobspb.Details
	}

	rows := []row{
		{
			id:      1,
			typ:     "CREATE STATS",
			status:  "running",
			details: jobspb.CreateStatsDetails{Name: "my_stats"},
		},
		{
			id:      2,
			typ:     "AUTO CREATE STATS",
			status:  "running",
			details: jobspb.CreateStatsDetails{Name: "__auto__"},
		},
	}

	for _, in := range rows {
		// system.jobs is part proper SQL columns, part protobuf, so we can't use the
		// row struct directly.
		inPayload, err := protoutil.Marshal(&jobspb.Payload{
			Details: jobspb.WrapPayloadDetails(in.details),
		})
		if err != nil {
			t.Fatal(err)
		}

		sqlDB.Exec(t,
			`INSERT INTO system.jobs (id, status, payload) VALUES ($1, $2, $3)`,
			in.id, in.status, inPayload,
		)
	}

	var out row

	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOB 1]`).Scan(&out.id, &out.typ)
	if out.id != 1 || out.typ != "CREATE STATS" {
		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
			1, "CREATE STATS", out.id, out.typ)
	}

	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOBS SELECT 1]`).Scan(&out.id, &out.typ)
	if out.id != 1 || out.typ != "CREATE STATS" {
		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
			1, "CREATE STATS", out.id, out.typ)
	}

	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOBS (SELECT 1)]`).Scan(&out.id, &out.typ)
	if out.id != 1 || out.typ != "CREATE STATS" {
		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
			1, "CREATE STATS", out.id, out.typ)
	}
	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOB 2]`).Scan(&out.id, &out.typ)
	if out.id != 2 || out.typ != "AUTO CREATE STATS" {
		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
			2, "AUTO CREATE STATS", out.id, out.typ)
	}

	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOBS SELECT 2]`).Scan(&out.id, &out.typ)
	if out.id != 2 || out.typ != "AUTO CREATE STATS" {
		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
			2, "AUTO CREATE STATS", out.id, out.typ)
	}

	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW JOBS]`).Scan(&out.id, &out.typ)
	if out.id != 1 || out.typ != "CREATE STATS" {
		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
			1, "CREATE STATS", out.id, out.typ)
	}

	sqlDB.QueryRow(t, `SELECT job_id, job_type FROM [SHOW AUTOMATIC JOBS]`).Scan(&out.id, &out.typ)
	if out.id != 2 || out.typ != "AUTO CREATE STATS" {
		t.Fatalf("Expected id:%d and type:%s but found id:%d and type:%s",
			2, "AUTO CREATE STATS", out.id, out.typ)
	}
}

func TestShowJobsWithError(t *testing.T) {
	defer leaktest.AfterTest(t)()

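	// The test below inserts copies of a real job row with corrupted payload
	// and/or progress bytes and verifies that SHOW JOBS reports a decoding
	// error for the affected fields instead of failing the whole query.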
	params, _ := tests.CreateTestServerParams()
	s, sqlDB, _ := serverutils.StartServer(t, params)
	defer s.Stopper().Stop(context.Background())

	// Create at least 6 rows, some of which are corrupted.
	// Ensure there is at least one row in system.jobs.
	if _, err := sqlDB.Exec(`
	CREATE TABLE foo(x INT); ALTER TABLE foo ADD COLUMN y INT;
	`); err != nil {
		t.Fatal(err)
	}
	// Get the id of the ADD COLUMN job to use later.
	var jobID int64
	if err := sqlDB.QueryRow(`SELECT id FROM system.jobs ORDER BY id DESC LIMIT 1`).Scan(&jobID); err != nil {
		t.Fatal(err)
	}

	// Now insert more rows based on the valid row, some of which are corrupted.
	if _, err := sqlDB.Exec(`
	-- Create a corrupted payload field from the most recent row.
	INSERT INTO system.jobs(id, status, payload, progress) SELECT id+1, status, '\xaaaa'::BYTES, progress FROM system.jobs WHERE id = $1;
	`, jobID); err != nil {
		t.Fatal(err)
	}
	if _, err := sqlDB.Exec(`
	-- Create a corrupted progress field.
	INSERT INTO system.jobs(id, status, payload, progress) SELECT id+2, status, payload, '\xaaaa'::BYTES FROM system.jobs WHERE id = $1;
	`, jobID); err != nil {
		t.Fatal(err)
	}
	if _, err := sqlDB.Exec(`
	-- Corrupt both fields.
	INSERT INTO system.jobs(id, status, payload, progress) SELECT id+3, status, '\xaaaa'::BYTES, '\xaaaa'::BYTES FROM system.jobs WHERE id = $1;
	`, jobID); err != nil {
		t.Fatal(err)
	}
	if _, err := sqlDB.Exec(`
	-- Test what happens with a NULL progress field (which is a valid value).
	INSERT INTO system.jobs(id, status, payload, progress) SELECT id+4, status, payload, NULL::BYTES FROM system.jobs WHERE id = $1;
	`, jobID); err != nil {
		t.Fatal(err)
	}
	if _, err := sqlDB.Exec(`
	-- Test what happens with a NULL progress field and a corrupted payload.
	INSERT INTO system.jobs(id, status, payload, progress) SELECT id+5, status, '\xaaaa'::BYTES, NULL::BYTES FROM system.jobs WHERE id = $1;
	`, jobID); err != nil {
		t.Fatal(err)
	}

	// Extract the last 6 rows from the query.
	rows, err := sqlDB.Query(`
	WITH a AS (SELECT job_id, description, fraction_completed, error FROM [SHOW JOBS] ORDER BY job_id DESC LIMIT 6)
	SELECT ifnull(description, 'NULL'), ifnull(fraction_completed, -1)::string, ifnull(error,'NULL') FROM a ORDER BY job_id ASC`)
	if err != nil {
		t.Fatal(err)
	}
	defer rows.Close()

	var desc, frac, errStr string

	// Valid row.
	rowNum := 0
	if !rows.Next() {
		t.Fatalf("%d: too few rows", rowNum)
	}
	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
		t.Fatalf("%d: %v", rowNum, err)
	}
	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
	if desc == "NULL" || errStr != "" || frac[0] == '-' {
		t.Fatalf("%d: invalid row", rowNum)
	}
	rowNum++

	// Corrupted payload but valid progress.
	if !rows.Next() {
		t.Fatalf("%d: too few rows", rowNum)
	}
	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
		t.Fatalf("%d: %v", rowNum, err)
	}
	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
	if desc != "NULL" || !strings.HasPrefix(errStr, "error decoding payload") || frac[0] == '-' {
		t.Fatalf("%d: invalid row", rowNum)
	}
	rowNum++

	// Corrupted progress but valid payload.
	if !rows.Next() {
		t.Fatalf("%d: too few rows", rowNum)
	}
	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
		t.Fatalf("%d: %v", rowNum, err)
	}
	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
	if desc == "NULL" || !strings.HasPrefix(errStr, "error decoding progress") || frac[0] != '-' {
		t.Fatalf("%d: invalid row", rowNum)
	}
	rowNum++

	// Both payload and progress corrupted.
	if !rows.Next() {
		t.Fatalf("%d: too few rows", rowNum)
	}
	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
		t.Fatalf("%d: %v", rowNum, err)
	}
	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
	if desc != "NULL" ||
		!strings.Contains(errStr, "error decoding payload") ||
		!strings.Contains(errStr, "error decoding progress") ||
		frac[0] != '-' {
		t.Fatalf("%d: invalid row", rowNum)
	}
	rowNum++

	// Valid payload and missing progress.
	if !rows.Next() {
		t.Fatalf("%d: too few rows", rowNum)
	}
	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
		t.Fatalf("%d: %v", rowNum, err)
	}
	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
	if desc == "NULL" || errStr != "" || frac[0] != '-' {
		t.Fatalf("%d: invalid row", rowNum)
	}
	rowNum++

	// Invalid payload and missing progress.
	if !rows.Next() {
		t.Fatalf("%d: too few rows", rowNum)
	}
	if err := rows.Scan(&desc, &frac, &errStr); err != nil {
		t.Fatalf("%d: %v", rowNum, err)
	}
	t.Logf("row %d: %q %q %v", rowNum, desc, errStr, frac)
	if desc != "NULL" ||
		!strings.Contains(errStr, "error decoding payload") ||
		strings.Contains(errStr, "error decoding progress") ||
		frac[0] != '-' {
		t.Fatalf("%d: invalid row", rowNum)
	}
	rowNum++
}

func TestShowJobWhenComplete(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// Canceling a job relies on the adopt daemon to move the job to state
	// reverting.
	defer func(oldInterval time.Duration) {
		jobs.DefaultAdoptInterval = oldInterval
	}(jobs.DefaultAdoptInterval)
	jobs.DefaultAdoptInterval = 10 * time.Millisecond
	ctx := context.Background()
	s, db, _ := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop(ctx)
	registry := s.JobRegistry().(*jobs.Registry)
	mockJob := jobs.Record{Details: jobspb.ImportDetails{}, Progress: jobspb.ImportProgress{}}
	done := make(chan struct{})
	defer close(done)
	jobs.RegisterConstructor(
		jobspb.TypeImport, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer {
			return jobs.FakeResumer{
				OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
					select {
					case <-ctx.Done():
						return ctx.Err()
					case <-done:
						return nil
					}
				},
			}
		})

	type row struct {
		id     int64
		status string
	}
	var out row

	t.Run("show job", func(t *testing.T) {
		// Start a job and cancel it so it reaches a terminal state, then query it
		// with SHOW JOB WHEN COMPLETE.
		job, _, err := registry.CreateAndStartJob(ctx, nil, mockJob)
		if err != nil {
			t.Fatal(err)
		}
		group := ctxgroup.WithContext(ctx)
		group.GoCtx(func(ctx context.Context) error {
			if err := db.QueryRowContext(
				ctx,
				`SELECT job_id, status
				FROM [SHOW JOB WHEN COMPLETE $1]`,
				*job.ID()).Scan(&out.id, &out.status); err != nil {
				return err
			}
			if out.status != "canceled" {
				return errors.Errorf(
					"Expected status 'canceled' but got '%s'", out.status)
			}
			if *job.ID() != out.id {
				return errors.Errorf(
					"Expected job id %d but got %d", *job.ID(), out.id)
			}
			return nil
		})
		// Give a chance for the above group to schedule in order to test that
		// SHOW JOB WHEN COMPLETE does block until the job is canceled.
		time.Sleep(2 * time.Millisecond)
		if _, err = db.ExecContext(ctx, "CANCEL JOB $1", *job.ID()); err == nil {
			err = group.Wait()
		}
		if err != nil {
			t.Fatal(err)
		}
	})
	t.Run("show jobs", func(t *testing.T) {
		// Start two jobs and cancel the first one to make sure the
		// query still blocks until the second job is also canceled.
		var jobs [2]*jobs.Job
		for i := range jobs {
			job, _, err := registry.CreateAndStartJob(ctx, nil, mockJob)
			if err != nil {
				t.Fatal(err)
			}
			jobs[i] = job
		}
		if _, err := db.ExecContext(ctx, "CANCEL JOB $1", *jobs[0].ID()); err != nil {
			t.Fatal(err)
		}
		group := ctxgroup.WithContext(ctx)
		group.GoCtx(func(ctx context.Context) error {
			rows, err := db.QueryContext(ctx,
				`SELECT job_id, status
				FROM [SHOW JOBS WHEN COMPLETE (SELECT $1 UNION SELECT $2)]`,
				*jobs[0].ID(), *jobs[1].ID())
			if err != nil {
				return err
			}
			var cnt int
			for rows.Next() {
				if err := rows.Scan(&out.id, &out.status); err != nil {
					return err
				}
				cnt += 1
				switch out.id {
				case *jobs[0].ID():
				case *jobs[1].ID():
					// SHOW JOBS WHEN COMPLETE finishes only after all jobs are
					// canceled.
					if out.status != "canceled" {
						return errors.Errorf(
							"Expected status 'canceled' but got '%s'",
							out.status)
					}
				default:
					return errors.Errorf(
						"Expected either id:%d or id:%d but got: %d",
						*jobs[0].ID(), *jobs[1].ID(), out.id)
				}
			}
			if cnt != 2 {
				return errors.Errorf("Expected 2 results but found %d", cnt)
			}
			return nil
		})
		// Give a chance for the above group to schedule in order to test that
		// SHOW JOBS WHEN COMPLETE does block until the job is canceled.
		time.Sleep(2 * time.Millisecond)
		var err error
		if _, err = db.ExecContext(ctx, "CANCEL JOB $1", *jobs[1].ID()); err == nil {
			err = group.Wait()
		}
		if err != nil {
			t.Fatal(err)
		}
	})
}

func TestJobInTxn(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer jobs.ResetConstructors()()

	defer func(oldInterval time.Duration) {
		jobs.DefaultAdoptInterval = oldInterval
	}(jobs.DefaultAdoptInterval)
	jobs.DefaultAdoptInterval = 5 * time.Second

	ctx := context.Background()
	s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop(ctx)

	// Accessed atomically.
	var hasRun int32
	var job *jobs.Job

	defer sql.ClearPlanHooks()
	// Piggy back on BACKUP to be able to create a succeeding test job.
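	// The plan hook below intercepts BACKUP statements and queues a job through
	// the planner's ExtendedEvalContext, so the job is created inside the user's
	// transaction; a second hook further down does the same for RESTORE with a
	// resumer that always fails.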
	sql.AddPlanHook(
		func(_ context.Context, stmt tree.Statement, phs sql.PlanHookState,
		) (sql.PlanHookRowFn, sqlbase.ResultColumns, []sql.PlanNode, bool, error) {
			st, ok := stmt.(*tree.Backup)
			if !ok {
				return nil, nil, nil, false, nil
			}
			fn := func(_ context.Context, _ []sql.PlanNode, _ chan<- tree.Datums) error {
				var err error
				job, err = phs.ExtendedEvalContext().QueueJob(
					jobs.Record{
						Description: st.String(),
						Details:     jobspb.BackupDetails{},
						Progress:    jobspb.BackupProgress{},
					},
				)
				return err
			}
			return fn, nil, nil, false, nil
		},
	)
	jobs.RegisterConstructor(jobspb.TypeBackup, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
		return jobs.FakeResumer{
			OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
				t.Logf("Resuming job: %+v", job.Payload())
				atomic.AddInt32(&hasRun, 1)
				return nil
			},
			FailOrCancel: func(ctx context.Context) error {
				atomic.AddInt32(&hasRun, 1)
				return nil
			},
		}
	})
	// Piggyback on RESTORE to create a test job that fails.
	sql.AddPlanHook(
		func(_ context.Context, stmt tree.Statement, phs sql.PlanHookState,
		) (sql.PlanHookRowFn, sqlbase.ResultColumns, []sql.PlanNode, bool, error) {
			_, ok := stmt.(*tree.Restore)
			if !ok {
				return nil, nil, nil, false, nil
			}
			fn := func(_ context.Context, _ []sql.PlanNode, _ chan<- tree.Datums) error {
				var err error
				job, err = phs.ExtendedEvalContext().QueueJob(
					jobs.Record{
						Description: "RESTORE",
						Details:     jobspb.RestoreDetails{},
						Progress:    jobspb.RestoreProgress{},
					},
				)
				return err
			}
			return fn, nil, nil, false, nil
		},
	)
	jobs.RegisterConstructor(jobspb.TypeRestore, func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
		return jobs.FakeResumer{
			OnResume: func(_ context.Context, _ chan<- tree.Datums) error {
				return errors.New("RESTORE failed")
			},
		}
	})

	t.Run("rollback txn", func(t *testing.T) {
		start := timeutil.Now()

		txn, err := sqlDB.Begin()
		require.NoError(t, err)
		_, err = txn.Exec("BACKUP tobeaborted TO doesnotmatter")
		require.NoError(t, err)

		// If we roll back, then the job should not run.
		require.NoError(t, txn.Rollback())
		registry := s.JobRegistry().(*jobs.Registry)
		_, err = registry.LoadJob(ctx, *job.ID())
		require.Error(t, err, "the job should not exist after the txn is rolled back")
		require.True(t, jobs.HasJobNotFoundError(err))

		sqlRunner := sqlutils.MakeSQLRunner(sqlDB)
		// Just in case the job was scheduled, wait for it to finish to avoid a
		// race.
		sqlRunner.Exec(t, "SHOW JOB WHEN COMPLETE $1", *job.ID())
		require.Equal(t, int32(0), atomic.LoadInt32(&hasRun),
			"job has run in transaction before txn commit")

		require.True(t, timeutil.Since(start) < jobs.DefaultAdoptInterval, "job should have been adopted immediately")
	})

	t.Run("normal success", func(t *testing.T) {
		start := timeutil.Now()

		// Now actually commit the transaction and check that the job ran.
		txn, err := sqlDB.Begin()
		require.NoError(t, err)
		_, err = txn.Exec("BACKUP tocommit TO foo")
		require.NoError(t, err)
		// Committing will block and wait for all jobs to run.
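		// DefaultAdoptInterval was raised to 5s above, so the elapsed-time check
		// at the end of this subtest demonstrates that the job ran as part of the
		// commit rather than being picked up by the adoption loop.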
		require.NoError(t, txn.Commit())
		registry := s.JobRegistry().(*jobs.Registry)
		j, err := registry.LoadJob(ctx, *job.ID())
		require.NoError(t, err, "queued job not found")
		require.NotEqual(t, int32(0), atomic.LoadInt32(&hasRun),
			"job scheduled in transaction did not run")
		require.Equal(t, int32(1), atomic.LoadInt32(&hasRun),
			"more than one job ran")
		require.Equal(t, "", j.Payload().Error)

		require.True(t, timeutil.Since(start) < jobs.DefaultAdoptInterval, "job should have been adopted immediately")
	})

	t.Run("one of the queued jobs fails", func(t *testing.T) {
		start := timeutil.Now()
		txn, err := sqlDB.Begin()
		require.NoError(t, err)

		// Add a succeeding job.
		_, err = txn.Exec("BACKUP doesnotmatter TO doesnotmatter")
		require.NoError(t, err)
		// We hooked up a failing test job to RESTORE.
		_, err = txn.Exec("RESTORE TABLE tbl FROM somewhere")
		require.NoError(t, err)

		// Now actually commit the transaction and check that the commit reports
		// the failure.
		require.Error(t, txn.Commit())
		require.True(t, timeutil.Since(start) < jobs.DefaultAdoptInterval, "job should have been adopted immediately")
	})
}

// TestStartableJob tests that a StartableJob returns the expected errors when
// used incorrectly and performs the appropriate cleanup in CleanupOnRollback.
func TestStartableJob(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer jobs.ResetConstructors()()

	ctx := context.Background()
	s, _, db := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop(ctx)
	jr := s.JobRegistry().(*jobs.Registry)
	var resumeFunc atomic.Value
	resumeFunc.Store(func(ctx context.Context, _ chan<- tree.Datums) error {
		return nil
	})
	setResumeFunc := func(f func(ctx context.Context, _ chan<- tree.Datums) error) (cleanup func()) {
		prev := resumeFunc.Load()
		resumeFunc.Store(f)
		return func() { resumeFunc.Store(prev) }
	}
	jobs.RegisterConstructor(jobspb.TypeRestore, func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
		return jobs.FakeResumer{
			OnResume: func(ctx context.Context, resultsCh chan<- tree.Datums) error {
				return resumeFunc.Load().(func(ctx context.Context, _ chan<- tree.Datums) error)(ctx, resultsCh)
			},
		}
	})
	rec := jobs.Record{
		Description:   "There's a snake in my boot!",
		Username:      "Woody Pride",
		DescriptorIDs: []sqlbase.ID{1, 2, 3},
		Details:       jobspb.RestoreDetails{},
		Progress:      jobspb.RestoreProgress{},
	}
	createStartableJob := func(t *testing.T) (sj *jobs.StartableJob) {
		require.NoError(t, db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) (err error) {
			sj, err = jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
			return err
		}))
		return sj
	}
	t.Run("Start called more than once", func(t *testing.T) {
		sj := createStartableJob(t)
		_, err := sj.Start(ctx)
		require.NoError(t, err)
		_, err = sj.Start(ctx)
		require.Regexp(t, `StartableJob \d+ cannot be started more than once`, err)
	})
	t.Run("Start called with active txn", func(t *testing.T) {
		txn := db.NewTxn(ctx, "test")
		defer func() {
			require.NoError(t, txn.Rollback(ctx))
		}()
		sj, err := jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
		require.NoError(t, err)
		_, err = sj.Start(ctx)
		require.Regexp(t, `cannot resume .* job which is not committed`, err)
	})
	t.Run("Start called with aborted txn", func(t *testing.T) {
		txn := db.NewTxn(ctx, "test")
		sj, err := jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
		require.NoError(t, err)
		require.NoError(t, txn.Rollback(ctx))
		_, err = sj.Start(ctx)
		require.Regexp(t, `cannot resume .* job which is not committed`, err)
	})
	t.Run("CleanupOnRollback called with active txn", func(t *testing.T) {
		txn := db.NewTxn(ctx, "test")
		defer func() {
			require.NoError(t, txn.Rollback(ctx))
		}()
		sj, err := jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
		require.NoError(t, err)
		err = sj.CleanupOnRollback(ctx)
		require.Regexp(t, `cannot call CleanupOnRollback for a StartableJob with a non-finalized transaction`, err)
	})
	t.Run("CleanupOnRollback called with committed txn", func(t *testing.T) {
		sj := createStartableJob(t)
		err := sj.CleanupOnRollback(ctx)
		require.Regexp(t, `cannot call CleanupOnRollback for a StartableJob created by a committed transaction`, err)
	})
	t.Run("CleanupOnRollback positive case", func(t *testing.T) {
		txn := db.NewTxn(ctx, "test")
		sj, err := jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
		require.NoError(t, err)
		require.NoError(t, txn.Rollback(ctx))
		require.NoError(t, sj.CleanupOnRollback(ctx))
		for _, id := range jr.CurrentlyRunningJobs() {
			require.NotEqual(t, id, *sj.ID())
		}
	})
	t.Run("Cancel", func(t *testing.T) {
		txn := db.NewTxn(ctx, "test")
		sj, err := jr.CreateStartableJobWithTxn(ctx, rec, txn, nil)
		require.NoError(t, err)
		require.NoError(t, txn.Commit(ctx))
		require.NoError(t, sj.Cancel(ctx))
		status, err := sj.CurrentStatus(ctx)
		require.NoError(t, err)
		require.Equal(t, jobs.StatusCancelRequested, status)
		for _, id := range jr.CurrentlyRunningJobs() {
			require.NotEqual(t, id, *sj.ID())
		}
		_, err = sj.Start(ctx)
		require.Regexp(t, "job with status cancel-requested cannot be marked started", err)
	})
	setUpRunTest := func(t *testing.T) (
		sj *jobs.StartableJob,
		resultCh <-chan tree.Datums,
		blockResume func() (waitForBlocked func() (resultsCh chan<- tree.Datums, unblockWithError func(error))),
		cleanup func(),
	) {
		type blockResp struct {
			errCh     chan error
			resultsCh chan<- tree.Datums
		}
		blockCh := make(chan chan blockResp, 1)
		blockResume = func() (waitForBlocked func() (resultsCh chan<- tree.Datums, unblockWithError func(error))) {
			blockRequest := make(chan blockResp, 1)
			blockCh <- blockRequest // from test to resumer
			return func() (resultsCh chan<- tree.Datums, unblockWithError func(error)) {
				blocked := <-blockRequest // from resumer to test
				return blocked.resultsCh, func(err error) {
					blocked.errCh <- err // from test to resumer
				}
			}
		}
		cleanup = setResumeFunc(func(ctx context.Context, resultsCh chan<- tree.Datums) error {
			select {
			case blockRequest := <-blockCh:
				unblock := make(chan error)
				blockRequest <- blockResp{
					errCh:     unblock,
					resultsCh: resultsCh,
				}
				if err := <-unblock; err != nil {
					return err
				}
			default:
			}
			return nil
		})
		clientResults := make(chan tree.Datums)
		require.NoError(t, db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) (err error) {
			sj, err = jr.CreateStartableJobWithTxn(ctx, rec, txn, clientResults)
			return err
		}))
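		// Hand back the job, the channel on which the client receives results,
		// the blockResume hook defined above, and a cleanup func that restores
		// the previous resume function.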
		return sj, clientResults, blockResume, cleanup
	}
	t.Run("Run - error during resume", func(t *testing.T) {
		sj, _, blockResume, cleanup := setUpRunTest(t)
		defer cleanup()
		waitForBlocked := blockResume()
		runErr := make(chan error)
		go func() { runErr <- sj.Run(ctx) }()
		_, unblock := waitForBlocked()
		unblock(errors.New("boom"))
		require.Regexp(t, "boom", <-runErr)
	})
	t.Run("Run - client canceled", func(t *testing.T) {
		sj, _, blockResume, cleanup := setUpRunTest(t)
		defer cleanup()
		ctxToCancel, cancel := context.WithCancel(ctx)
		runErr := make(chan error)
		waitForBlocked := blockResume()
		go func() { runErr <- sj.Run(ctxToCancel) }()
		resultsCh, unblock := waitForBlocked()
		cancel()
		require.Regexp(t, context.Canceled, <-runErr)
		resultsCh <- tree.Datums{}
		unblock(nil)
		testutils.SucceedsSoon(t, func() error {
			loaded, err := jr.LoadJob(ctx, *sj.ID())
			require.NoError(t, err)
			st, err := loaded.CurrentStatus(ctx)
			require.NoError(t, err)
			if st != jobs.StatusSucceeded {
				return errors.Errorf("expected %s, got %s", jobs.StatusSucceeded, st)
			}
			return nil
		})
	})
	t.Run("Run - results chan closed", func(t *testing.T) {
		sj, clientResultsChan, blockResume, cleanup := setUpRunTest(t)
		defer cleanup()
		runErr := make(chan error)
		waitForBlocked := blockResume()
		go func() { runErr <- sj.Run(ctx) }()
		resultsCh, unblock := waitForBlocked()
		go func() {
			_, ok := <-clientResultsChan
			assert.True(t, ok)
			_, ok = <-clientResultsChan
			assert.False(t, ok)
			unblock(nil)
		}()
		resultsCh <- tree.Datums{}
		close(resultsCh)
		require.NoError(t, <-runErr)
	})
}

// TestUnmigratedSchemaChangeJobs tests that schema change jobs created in 19.2
// that have not undergone a migration cannot be adopted, canceled, or paused.
func TestUnmigratedSchemaChangeJobs(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer jobs.ResetConstructors()()
	defer func(oldInterval time.Duration) {
		jobs.DefaultAdoptInterval = oldInterval
	}(jobs.DefaultAdoptInterval)
	jobs.DefaultAdoptInterval = 10 * time.Millisecond

	ctx := context.Background()

	s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{})
	defer s.Stopper().Stop(ctx)

	registry := s.JobRegistry().(*jobs.Registry)

	// The default FormatVersion value in SchemaChangeDetails corresponds to a
	// pre-20.1 job.
	rec := jobs.Record{
		DescriptorIDs: []sqlbase.ID{1},
		Details:       jobspb.SchemaChangeDetails{},
		Progress:      jobspb.SchemaChangeProgress{},
	}

	t.Run("job is not adopted", func(t *testing.T) {
		resuming := make(chan struct{})
		jobs.RegisterConstructor(jobspb.TypeSchemaChange, func(_ *jobs.Job, _ *cluster.Settings) jobs.Resumer {
			return jobs.FakeResumer{
				OnResume: func(ctx context.Context, _ chan<- tree.Datums) error {
					resuming <- struct{}{}
					return nil
				},
			}
		})
		select {
		case <-resuming:
			t.Fatal("job was resumed")
		case <-time.After(time.Second):
			// With an adopt interval of 10 ms, within 1 s we can be reasonably sure
			// that the job was not adopted.
		}
	})

	t.Run("pause not supported", func(t *testing.T) {
		job, err := registry.CreateJobWithTxn(ctx, rec, nil /* txn */)
		if err != nil {
			t.Fatal(err)
		}
		if _, err := sqlDB.Exec("PAUSE JOB $1", *job.ID()); !testutils.IsError(err, "cannot be paused in this version") {
			t.Fatal(err)
		}
	})

	t.Run("cancel not supported", func(t *testing.T) {
		job, err := registry.CreateJobWithTxn(ctx, rec, nil /* txn */)
		if err != nil {
			t.Fatal(err)
		}
		if _, err := sqlDB.Exec("CANCEL JOB $1", *job.ID()); !testutils.IsError(err, "cannot be canceled in this version") {
			t.Fatal(err)
		}
	})
}