// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package jobs

import (
	"context"
	"fmt"
	"reflect"
	"sync/atomic"

	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/security"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlutil"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
)

// Job manages logging the progress of long-running system processes, like
// backups and restores, to the system.jobs table.
type Job struct {
	// TODO(benesch): avoid giving Job a reference to Registry. This will likely
	// require inverting control: rather than having the worker call Created,
	// Started, etc., have Registry call a setupFn and a workFn as appropriate.
	registry *Registry

	// id is the job's ID in system.jobs. It stays nil until insert has
	// written the row; ID() returns it as-is.
	id *int64
	// txn, when non-nil, is the caller-supplied transaction installed by
	// WithTxn. runInTxn uses it for one operation and then resets it to nil.
	txn *kv.Txn
	// mu holds the in-memory copies of the job's payload and progress.
	// Payload, Progress and Details read them while holding the mutex.
	mu struct {
		syncutil.Mutex
		payload  jobspb.Payload
		progress jobspb.Progress
	}
}

// Record bundles together the user-managed fields in jobspb.Payload.
51 type Record struct { 52 Description string 53 Statement string 54 Username string 55 DescriptorIDs sqlbase.IDs 56 Details jobspb.Details 57 Progress jobspb.ProgressDetails 58 RunningStatus RunningStatus 59 // NonCancelable is used to denote when a job cannot be canceled. This field 60 // will not be respected in mixed version clusters where some nodes have 61 // a version < 20.1, so it can only be used in cases where all nodes having 62 // versions >= 20.1 is guaranteed. 63 NonCancelable bool 64 } 65 66 // StartableJob is a job created with a transaction to be started later. 67 // See Registry.CreateStartableJob 68 type StartableJob struct { 69 *Job 70 txn *kv.Txn 71 resumer Resumer 72 resumerCtx context.Context 73 cancel context.CancelFunc 74 resultsCh chan<- tree.Datums 75 starts int64 // used to detect multiple calls to Start() 76 } 77 78 func init() { 79 // NB: This exists to make the jobs payload usable during testrace. See the 80 // comment on protoutil.Clone and the implementation of Marshal when run under 81 // race. 82 var jobPayload jobspb.Payload 83 jobsDetailsInterfaceType := reflect.TypeOf(&jobPayload.Details).Elem() 84 var jobProgress jobspb.Progress 85 jobsProgressDetailsInterfaceType := reflect.TypeOf(&jobProgress.Details).Elem() 86 protoutil.RegisterUnclonableType(jobsDetailsInterfaceType, reflect.Array) 87 protoutil.RegisterUnclonableType(jobsProgressDetailsInterfaceType, reflect.Array) 88 89 } 90 91 // Status represents the status of a job in the system.jobs table. 92 type Status string 93 94 // RunningStatus represents the more detailed status of a running job in 95 // the system.jobs table. 96 type RunningStatus string 97 98 const ( 99 // StatusPending is for jobs that have been created but on which work has 100 // not yet started. 101 StatusPending Status = "pending" 102 // StatusRunning is for jobs that are currently in progress. 
103 StatusRunning Status = "running" 104 // StatusPaused is for jobs that are not currently performing work, but have 105 // saved their state and can be resumed by the user later. 106 StatusPaused Status = "paused" 107 // StatusFailed is for jobs that failed. 108 StatusFailed Status = "failed" 109 // StatusReverting is for jobs that failed or were canceled and their changes are being 110 // being reverted. 111 StatusReverting Status = "reverting" 112 // StatusSucceeded is for jobs that have successfully completed. 113 StatusSucceeded Status = "succeeded" 114 // StatusCanceled is for jobs that were explicitly canceled by the user and 115 // cannot be resumed. 116 StatusCanceled Status = "canceled" 117 // StatusCancelRequested is for jobs that were requested to be canceled by 118 // the user but may be still running Resume. The node that is running the job 119 // will change it to StatusReverting the next time it runs maybeAdoptJobs. 120 StatusCancelRequested Status = "cancel-requested" 121 // StatusPauseRequested is for jobs that were requested to be paused by the 122 // user but may be still resuming or reverting. The node that is running the 123 // job will change its state to StatusPaused the next time it runs 124 // maybeAdoptJobs and will stop running it. 125 StatusPauseRequested Status = "pause-requested" 126 ) 127 128 var ( 129 errJobCanceled = errors.New("job canceled by user") 130 ) 131 132 // isOldSchemaChangeJob returns whether the provided payload is for a job that 133 // is a 19.2-style schema change, and therefore cannot be run or updated in 20.1 134 // (without first having undergone a migration). 135 // TODO (lucy): Remove this in 20.2. (I think it's possible in theory for a 19.2 136 // schema change job to persist on a 20.1 cluster indefinitely, since the 137 // migration is asynchronous, so this will take some care beyond just removing 138 // the format version gate.) 
139 func isOldSchemaChangeJob(payload *jobspb.Payload) bool { 140 schemaChangeDetails, ok := payload.UnwrapDetails().(jobspb.SchemaChangeDetails) 141 return ok && schemaChangeDetails.FormatVersion < jobspb.JobResumerFormatVersion 142 } 143 144 // Terminal returns whether this status represents a "terminal" state: a state 145 // after which the job should never be updated again. 146 func (s Status) Terminal() bool { 147 return s == StatusFailed || s == StatusSucceeded || s == StatusCanceled 148 } 149 150 // InvalidStatusError is the error returned when the desired operation is 151 // invalid given the job's current status. 152 type InvalidStatusError struct { 153 id int64 154 status Status 155 op string 156 err string 157 } 158 159 func (e *InvalidStatusError) Error() string { 160 if e.err != "" { 161 return fmt.Sprintf("cannot %s %s job (id %d, err: %q)", e.op, e.status, e.id, e.err) 162 } 163 return fmt.Sprintf("cannot %s %s job (id %d)", e.op, e.status, e.id) 164 } 165 166 // SimplifyInvalidStatusError unwraps an *InvalidStatusError into an error 167 // message suitable for users. Other errors are returned as passed. 168 func SimplifyInvalidStatusError(err error) error { 169 if ierr := (*InvalidStatusError)(nil); errors.As(err, &ierr) { 170 return errors.Errorf("job %s", ierr.status) 171 } 172 return err 173 } 174 175 // ID returns the ID of the job that this Job is currently tracking. This will 176 // be nil if Created has not yet been called. 177 func (j *Job) ID() *int64 { 178 return j.id 179 } 180 181 // Created records the creation of a new job in the system.jobs table and 182 // remembers the assigned ID of the job in the Job. The job information is read 183 // from the Record field at the time Created is called. 
184 func (j *Job) created(ctx context.Context) error { 185 if j.ID() != nil { 186 return errors.Errorf("job already created with ID %v", *j.ID()) 187 } 188 return j.insert(ctx, j.registry.makeJobID(), nil /* lease */) 189 } 190 191 // Started marks the tracked job as started. 192 func (j *Job) started(ctx context.Context) error { 193 return j.Update(ctx, func(_ *kv.Txn, md JobMetadata, ju *JobUpdater) error { 194 if md.Status != StatusPending && md.Status != StatusRunning { 195 return errors.Errorf("job with status %s cannot be marked started", md.Status) 196 } 197 // TODO(spaskob): Remove this status change after we stop supporting 198 // pending job states. 199 ju.UpdateStatus(StatusRunning) 200 md.Payload.StartedMicros = timeutil.ToUnixMicros(j.registry.clock.Now().GoTime()) 201 ju.UpdatePayload(md.Payload) 202 return nil 203 }) 204 } 205 206 // CheckStatus verifies the status of the job and returns an error if the job's 207 // status isn't Running or Reverting. 208 func (j *Job) CheckStatus(ctx context.Context) error { 209 return j.Update(ctx, func(_ *kv.Txn, md JobMetadata, _ *JobUpdater) error { 210 return md.CheckRunningOrReverting() 211 }) 212 } 213 214 // CheckTerminalStatus returns true if the job is in a terminal status. 215 func (j *Job) CheckTerminalStatus(ctx context.Context) bool { 216 err := j.Update(ctx, func(_ *kv.Txn, md JobMetadata, _ *JobUpdater) error { 217 if !md.Status.Terminal() { 218 return &InvalidStatusError{md.ID, md.Status, "checking that job status is success", md.Payload.Error} 219 } 220 return nil 221 }) 222 223 return err == nil 224 } 225 226 // RunningStatus updates the detailed status of a job currently in progress. 227 // It sets the job's RunningStatus field to the value returned by runningStatusFn 228 // and persists runningStatusFn's modifications to the job's details, if any. 
229 func (j *Job) RunningStatus(ctx context.Context, runningStatusFn RunningStatusFn) error { 230 return j.Update(ctx, func(_ *kv.Txn, md JobMetadata, ju *JobUpdater) error { 231 if err := md.CheckRunningOrReverting(); err != nil { 232 return err 233 } 234 runningStatus, err := runningStatusFn(ctx, md.Progress.Details) 235 if err != nil { 236 return err 237 } 238 md.Progress.RunningStatus = string(runningStatus) 239 ju.UpdateProgress(md.Progress) 240 return nil 241 }) 242 } 243 244 // SetDescription updates the description of a created job. 245 func (j *Job) SetDescription(ctx context.Context, updateFn DescriptionUpdateFn) error { 246 return j.Update(ctx, func(_ *kv.Txn, md JobMetadata, ju *JobUpdater) error { 247 prev := md.Payload.Description 248 desc, err := updateFn(ctx, prev) 249 if err != nil { 250 return err 251 } 252 if prev != desc { 253 md.Payload.Description = desc 254 ju.UpdatePayload(md.Payload) 255 } 256 return nil 257 }) 258 } 259 260 // RunningStatusFn is a callback that computes a job's running status 261 // given its details. It is safe to modify details in the callback; those 262 // modifications will be automatically persisted to the database record. 263 type RunningStatusFn func(ctx context.Context, details jobspb.Details) (RunningStatus, error) 264 265 // DescriptionUpdateFn is a callback that computes a job's description 266 // given its current one. 267 type DescriptionUpdateFn func(ctx context.Context, description string) (string, error) 268 269 // FractionProgressedFn is a callback that computes a job's completion fraction 270 // given its details. It is safe to modify details in the callback; those 271 // modifications will be automatically persisted to the database record. 272 type FractionProgressedFn func(ctx context.Context, details jobspb.ProgressDetails) float32 273 274 // FractionUpdater returns a FractionProgressedFn that returns its argument. 
275 func FractionUpdater(f float32) FractionProgressedFn { 276 return func(ctx context.Context, details jobspb.ProgressDetails) float32 { 277 return f 278 } 279 } 280 281 // HighWaterProgressedFn is a callback that computes a job's high-water mark 282 // given its details. It is safe to modify details in the callback; those 283 // modifications will be automatically persisted to the database record. 284 type HighWaterProgressedFn func(ctx context.Context, txn *kv.Txn, details jobspb.ProgressDetails) (hlc.Timestamp, error) 285 286 // FractionProgressed updates the progress of the tracked job. It sets the job's 287 // FractionCompleted field to the value returned by progressedFn and persists 288 // progressedFn's modifications to the job's progress details, if any. 289 // 290 // Jobs for which progress computations do not depend on their details can 291 // use the FractionUpdater helper to construct a ProgressedFn. 292 func (j *Job) FractionProgressed(ctx context.Context, progressedFn FractionProgressedFn) error { 293 return j.Update(ctx, func(_ *kv.Txn, md JobMetadata, ju *JobUpdater) error { 294 if err := md.CheckRunningOrReverting(); err != nil { 295 return err 296 } 297 fractionCompleted := progressedFn(ctx, md.Progress.Details) 298 // allow for slight floating-point rounding inaccuracies 299 if fractionCompleted > 1.0 && fractionCompleted < 1.01 { 300 fractionCompleted = 1.0 301 } 302 if fractionCompleted < 0.0 || fractionCompleted > 1.0 { 303 return errors.Errorf( 304 "Job: fractionCompleted %f is outside allowable range [0.0, 1.0] (job %d)", 305 fractionCompleted, *j.ID(), 306 ) 307 } 308 md.Progress.Progress = &jobspb.Progress_FractionCompleted{ 309 FractionCompleted: fractionCompleted, 310 } 311 ju.UpdateProgress(md.Progress) 312 return nil 313 }) 314 } 315 316 // HighWaterProgressed updates the progress of the tracked job. 
It sets the 317 // job's HighWater field to the value returned by progressedFn and persists 318 // progressedFn's modifications to the job's progress details, if any. 319 func (j *Job) HighWaterProgressed(ctx context.Context, progressedFn HighWaterProgressedFn) error { 320 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 321 if err := md.CheckRunningOrReverting(); err != nil { 322 return err 323 } 324 highWater, err := progressedFn(ctx, txn, md.Progress.Details) 325 if err != nil { 326 return err 327 } 328 if highWater.Less(hlc.Timestamp{}) { 329 return errors.Errorf( 330 "Job: high-water %s is outside allowable range > 0.0 (job %d)", 331 highWater, *j.ID(), 332 ) 333 } 334 md.Progress.Progress = &jobspb.Progress_HighWater{ 335 HighWater: &highWater, 336 } 337 ju.UpdateProgress(md.Progress) 338 return nil 339 }) 340 } 341 342 // paused sets the status of the tracked job to paused. It is called by the 343 // registry adoption loop by the node currently running a job to move it from 344 // pauseRequested to paused. 345 func (j *Job) paused(ctx context.Context, fn func(context.Context, *kv.Txn) error) error { 346 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 347 if md.Status == StatusPaused { 348 // Already paused - do nothing. 349 return nil 350 } 351 if md.Status != StatusPauseRequested { 352 return fmt.Errorf("job with status %s cannot be set to paused", md.Status) 353 } 354 if fn != nil { 355 if err := fn(ctx, txn); err != nil { 356 return err 357 } 358 } 359 ju.UpdateStatus(StatusPaused) 360 return nil 361 }) 362 } 363 364 // resumed sets the status of the tracked job to running or reverting iff the 365 // job is currently paused. It does not directly resume the job; rather, it 366 // expires the job's lease so that a Registry adoption loop detects it and 367 // resumes it. 
368 func (j *Job) resumed(ctx context.Context) error { 369 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 370 if md.Status == StatusRunning || md.Status == StatusReverting { 371 // Already resumed - do nothing. 372 return nil 373 } 374 if md.Status != StatusPaused { 375 return fmt.Errorf("job with status %s cannot be resumed", md.Status) 376 } 377 // We use the absence of error to determine what state we should 378 // resume into. 379 if md.Payload.FinalResumeError == nil { 380 ju.UpdateStatus(StatusRunning) 381 } else { 382 ju.UpdateStatus(StatusReverting) 383 } 384 // NB: A nil lease indicates the job is not resumable, whereas an empty 385 // lease is always considered expired. 386 md.Payload.Lease = &jobspb.Lease{} 387 ju.UpdatePayload(md.Payload) 388 return nil 389 }) 390 } 391 392 // cancelRequested sets the status of the tracked job to cancel-requested. It 393 // does not directly cancel the job; like job.Paused, it expects the job to call 394 // job.Progressed soon, observe a "job is cancel-requested" error, and abort. 395 // Further the node the runs the job will actively cancel it when it notices 396 // that it is in state StatusCancelRequested and will move it to state 397 // StatusReverting. 398 func (j *Job) cancelRequested(ctx context.Context, fn func(context.Context, *kv.Txn) error) error { 399 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 400 // Don't allow 19.2-style schema change jobs to undergo changes in job state 401 // before they undergo a migration to make them properly runnable in 20.1 and 402 // later versions. While we could support cancellation in principle, the 403 // point is to cut down on the number of possible states that the migration 404 // could encounter. 405 // 406 // TODO (lucy): Remove this in 20.2. 
407 if isOldSchemaChangeJob(md.Payload) { 408 return errors.Newf( 409 "schema change job was created in earlier version, and cannot be " + 410 "canceled in this version until the upgrade is finalized and an internal migration is complete") 411 } 412 413 if md.Payload.Noncancelable { 414 return errors.Newf("job %d: not cancelable", *j.ID()) 415 } 416 if md.Status == StatusCancelRequested || md.Status == StatusCanceled { 417 return nil 418 } 419 if md.Status != StatusPending && md.Status != StatusRunning && md.Status != StatusPaused { 420 return fmt.Errorf("job with status %s cannot be requested to be canceled", md.Status) 421 } 422 if md.Status == StatusPaused && md.Payload.FinalResumeError != nil { 423 decodedErr := errors.DecodeError(ctx, *md.Payload.FinalResumeError) 424 return fmt.Errorf("job %d is paused and has non-nil FinalResumeError %s hence cannot be canceled and should be reverted", *j.ID(), decodedErr.Error()) 425 } 426 if fn != nil { 427 if err := fn(ctx, txn); err != nil { 428 return err 429 } 430 } 431 ju.UpdateStatus(StatusCancelRequested) 432 return nil 433 }) 434 } 435 436 // onPauseRequestFunc is a function used to perform action on behalf of a job 437 // implementation when a pause is requested. 438 type onPauseRequestFunc func( 439 ctx context.Context, planHookState interface{}, txn *kv.Txn, progress *jobspb.Progress, 440 ) error 441 442 // pauseRequested sets the status of the tracked job to pause-requested. It does 443 // not directly pause the job; it expects the node that runs the job will 444 // actively cancel it when it notices that it is in state StatusPauseRequested 445 // and will move it to state StatusPaused. 
446 func (j *Job) pauseRequested(ctx context.Context, fn onPauseRequestFunc) error { 447 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 448 // Don't allow 19.2-style schema change jobs to undergo changes in job state 449 // before they undergo a migration to make them properly runnable in 20.1 and 450 // later versions. 451 // 452 // In particular, schema change jobs could not be paused in 19.2, so allowing 453 // pausing here could break backward compatibility during an upgrade by 454 // forcing 19.2 nodes to deal with a schema change job in a state that wasn't 455 // possible in 19.2. 456 // 457 // TODO (lucy): Remove this in 20.2. 458 if isOldSchemaChangeJob(md.Payload) { 459 return errors.Newf( 460 "schema change job was created in earlier version, and cannot be " + 461 "paused in this version until the upgrade is finalized and an internal migration is complete") 462 } 463 464 if md.Status == StatusPauseRequested || md.Status == StatusPaused { 465 return nil 466 } 467 if md.Status != StatusPending && md.Status != StatusRunning && md.Status != StatusReverting { 468 return fmt.Errorf("job with status %s cannot be requested to be paused", md.Status) 469 } 470 if fn != nil { 471 phs, cleanup := j.registry.planFn("pause request", j.Payload().Username) 472 defer cleanup() 473 if err := fn(ctx, phs, txn, md.Progress); err != nil { 474 return err 475 } 476 ju.UpdateProgress(md.Progress) 477 } 478 ju.UpdateStatus(StatusPauseRequested) 479 return nil 480 }) 481 } 482 483 // reverted sets the status of the tracked job to reverted. 
484 func (j *Job) reverted( 485 ctx context.Context, err error, fn func(context.Context, *kv.Txn) error, 486 ) error { 487 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 488 if md.Status == StatusReverting { 489 return nil 490 } 491 if md.Status != StatusCancelRequested && md.Status != StatusRunning && md.Status != StatusPending { 492 return fmt.Errorf("job with status %s cannot be reverted", md.Status) 493 } 494 if fn != nil { 495 if err := fn(ctx, txn); err != nil { 496 return err 497 } 498 } 499 if err != nil { 500 md.Payload.Error = err.Error() 501 encodedErr := errors.EncodeError(ctx, err) 502 md.Payload.FinalResumeError = &encodedErr 503 ju.UpdatePayload(md.Payload) 504 } else { 505 if md.Payload.FinalResumeError == nil { 506 return errors.AssertionFailedf( 507 "tried to mark job as reverting, but no error was provided or recorded") 508 } 509 } 510 ju.UpdateStatus(StatusReverting) 511 return nil 512 }) 513 } 514 515 // Canceled sets the status of the tracked job to cancel. 516 func (j *Job) canceled(ctx context.Context, fn func(context.Context, *kv.Txn) error) error { 517 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 518 if md.Status == StatusCanceled { 519 return nil 520 } 521 if md.Status != StatusReverting { 522 return fmt.Errorf("job with status %s cannot be requested to be canceled", md.Status) 523 } 524 if fn != nil { 525 if err := fn(ctx, txn); err != nil { 526 return err 527 } 528 } 529 ju.UpdateStatus(StatusCanceled) 530 md.Payload.FinishedMicros = timeutil.ToUnixMicros(j.registry.clock.Now().GoTime()) 531 ju.UpdatePayload(md.Payload) 532 return nil 533 }) 534 } 535 536 // Failed marks the tracked job as having failed with the given error. 
537 func (j *Job) failed( 538 ctx context.Context, err error, fn func(context.Context, *kv.Txn) error, 539 ) error { 540 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 541 // TODO(spaskob): should we fail if the terminal state is not StatusFailed? 542 if md.Status.Terminal() { 543 // Already done - do nothing. 544 return nil 545 } 546 if fn != nil { 547 if err := fn(ctx, txn); err != nil { 548 return err 549 } 550 } 551 ju.UpdateStatus(StatusFailed) 552 md.Payload.Error = err.Error() 553 md.Payload.FinishedMicros = timeutil.ToUnixMicros(j.registry.clock.Now().GoTime()) 554 ju.UpdatePayload(md.Payload) 555 return nil 556 }) 557 } 558 559 // succeeded marks the tracked job as having succeeded and sets its fraction 560 // completed to 1.0. 561 func (j *Job) succeeded(ctx context.Context, fn func(context.Context, *kv.Txn) error) error { 562 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 563 if md.Status == StatusSucceeded { 564 return nil 565 } 566 if md.Status != StatusRunning && md.Status != StatusPending { 567 return errors.Errorf("Job with status %s cannot be marked as succeeded", md.Status) 568 } 569 if fn != nil { 570 if err := fn(ctx, txn); err != nil { 571 return err 572 } 573 } 574 ju.UpdateStatus(StatusSucceeded) 575 md.Payload.FinishedMicros = timeutil.ToUnixMicros(j.registry.clock.Now().GoTime()) 576 ju.UpdatePayload(md.Payload) 577 md.Progress.Progress = &jobspb.Progress_FractionCompleted{ 578 FractionCompleted: 1.0, 579 } 580 ju.UpdateProgress(md.Progress) 581 return nil 582 }) 583 } 584 585 // SetDetails sets the details field of the currently running tracked job. 
586 func (j *Job) SetDetails(ctx context.Context, details interface{}) error { 587 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 588 md.Payload.Details = jobspb.WrapPayloadDetails(details) 589 ju.UpdatePayload(md.Payload) 590 return nil 591 }) 592 } 593 594 // SetProgress sets the details field of the currently running tracked job. 595 func (j *Job) SetProgress(ctx context.Context, details interface{}) error { 596 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 597 md.Progress.Details = jobspb.WrapProgressDetails(details) 598 ju.UpdateProgress(md.Progress) 599 return nil 600 }) 601 } 602 603 // Payload returns the most recently sent Payload for this Job. 604 func (j *Job) Payload() jobspb.Payload { 605 j.mu.Lock() 606 defer j.mu.Unlock() 607 return j.mu.payload 608 } 609 610 // Progress returns the most recently sent Progress for this Job. 611 func (j *Job) Progress() jobspb.Progress { 612 j.mu.Lock() 613 defer j.mu.Unlock() 614 return j.mu.progress 615 } 616 617 // Details returns the details from the most recently sent Payload for this Job. 618 func (j *Job) Details() jobspb.Details { 619 j.mu.Lock() 620 defer j.mu.Unlock() 621 return j.mu.payload.UnwrapDetails() 622 } 623 624 // FractionCompleted returns completion according to the in-memory job state. 625 func (j *Job) FractionCompleted() float32 { 626 progress := j.Progress() 627 return progress.GetFractionCompleted() 628 } 629 630 // WithTxn sets the transaction that this Job will use for its next operation. 631 // If the transaction is nil, the Job will create a one-off transaction instead. 632 // If you use WithTxn, this Job will no longer be threadsafe. 633 func (j *Job) WithTxn(txn *kv.Txn) *Job { 634 j.txn = txn 635 return j 636 } 637 638 // MakeSessionBoundInternalExecutor makes an internal executor, for use in a job 639 // resumer, and sets it with the provided session data. 
See the comment on 640 // sessionBoundInternalExecutorFactory for a more detailed explanation of why 641 // this exists. 642 func (j *Job) MakeSessionBoundInternalExecutor( 643 ctx context.Context, sd *sessiondata.SessionData, 644 ) sqlutil.InternalExecutor { 645 return j.registry.sessionBoundInternalExecutorFactory(ctx, sd) 646 } 647 648 func (j *Job) runInTxn(ctx context.Context, fn func(context.Context, *kv.Txn) error) error { 649 if j.txn != nil { 650 defer func() { j.txn = nil }() 651 // Don't run fn in a retry loop because we need retryable errors to 652 // propagate up to the transaction's properly-scoped retry loop. 653 return fn(ctx, j.txn) 654 } 655 return j.registry.db.Txn(ctx, fn) 656 } 657 658 // JobNotFoundError is returned from load when the job does not exist. 659 type JobNotFoundError struct { 660 jobID int64 661 } 662 663 // Error makes JobNotFoundError an error. 664 func (e *JobNotFoundError) Error() string { 665 return fmt.Sprintf("job with ID %d does not exist", e.jobID) 666 } 667 668 // HasJobNotFoundError returns true if the error contains a JobNotFoundError. 
669 func HasJobNotFoundError(err error) bool { 670 return errors.HasType(err, (*JobNotFoundError)(nil)) 671 } 672 673 func (j *Job) load(ctx context.Context) error { 674 var payload *jobspb.Payload 675 var progress *jobspb.Progress 676 if err := j.runInTxn(ctx, func(ctx context.Context, txn *kv.Txn) error { 677 const stmt = "SELECT payload, progress FROM system.jobs WHERE id = $1" 678 row, err := j.registry.ex.QueryRowEx( 679 ctx, "load-job-query", txn, sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser}, 680 stmt, *j.ID()) 681 if err != nil { 682 return err 683 } 684 if row == nil { 685 return &JobNotFoundError{jobID: *j.ID()} 686 } 687 payload, err = UnmarshalPayload(row[0]) 688 if err != nil { 689 return err 690 } 691 progress, err = UnmarshalProgress(row[1]) 692 return err 693 }); err != nil { 694 return err 695 } 696 j.mu.payload = *payload 697 j.mu.progress = *progress 698 return nil 699 } 700 701 func (j *Job) insert(ctx context.Context, id int64, lease *jobspb.Lease) error { 702 if j.id != nil { 703 // Already created - do nothing. 704 return nil 705 } 706 707 j.mu.payload.Lease = lease 708 709 if err := j.runInTxn(ctx, func(ctx context.Context, txn *kv.Txn) error { 710 // Note: although the following uses ReadTimestamp and 711 // ReadTimestamp can diverge from the value of now() throughout a 712 // transaction, this may be OK -- we merely required ModifiedMicro 713 // to be equal *or greater* than previously inserted timestamps 714 // computed by now(). For now ReadTimestamp can only move forward 715 // and the assertion ReadTimestamp >= now() holds at all times. 
716 j.mu.progress.ModifiedMicros = timeutil.ToUnixMicros(txn.ReadTimestamp().GoTime()) 717 payloadBytes, err := protoutil.Marshal(&j.mu.payload) 718 if err != nil { 719 return err 720 } 721 progressBytes, err := protoutil.Marshal(&j.mu.progress) 722 if err != nil { 723 return err 724 } 725 726 const stmt = "INSERT INTO system.jobs (id, status, payload, progress) VALUES ($1, $2, $3, $4)" 727 _, err = j.registry.ex.Exec(ctx, "job-insert", txn, stmt, id, StatusRunning, payloadBytes, progressBytes) 728 return err 729 }); err != nil { 730 return err 731 } 732 j.id = &id 733 return nil 734 } 735 736 func (j *Job) adopt(ctx context.Context, oldLease *jobspb.Lease) error { 737 return j.Update(ctx, func(txn *kv.Txn, md JobMetadata, ju *JobUpdater) error { 738 if !md.Payload.Lease.Equal(oldLease) { 739 return errors.Errorf("current lease %v did not match expected lease %v", 740 md.Payload.Lease, oldLease) 741 } 742 md.Payload.Lease = j.registry.newLease() 743 if md.Payload.StartedMicros == 0 { 744 md.Payload.StartedMicros = timeutil.ToUnixMicros(j.registry.clock.Now().GoTime()) 745 } 746 ju.UpdatePayload(md.Payload) 747 return nil 748 }) 749 } 750 751 // UnmarshalPayload unmarshals and returns the Payload encoded in the input 752 // datum, which should be a tree.DBytes. 753 func UnmarshalPayload(datum tree.Datum) (*jobspb.Payload, error) { 754 payload := &jobspb.Payload{} 755 bytes, ok := datum.(*tree.DBytes) 756 if !ok { 757 return nil, errors.Errorf( 758 "Job: failed to unmarshal payload as DBytes (was %T)", datum) 759 } 760 if err := protoutil.Unmarshal([]byte(*bytes), payload); err != nil { 761 return nil, err 762 } 763 return payload, nil 764 } 765 766 // UnmarshalProgress unmarshals and returns the Progress encoded in the input 767 // datum, which should be a tree.DBytes. 
768 func UnmarshalProgress(datum tree.Datum) (*jobspb.Progress, error) { 769 progress := &jobspb.Progress{} 770 bytes, ok := datum.(*tree.DBytes) 771 if !ok { 772 return nil, errors.Errorf( 773 "Job: failed to unmarshal Progress as DBytes (was %T)", datum) 774 } 775 if err := protoutil.Unmarshal([]byte(*bytes), progress); err != nil { 776 return nil, err 777 } 778 return progress, nil 779 } 780 781 // CurrentStatus returns the current job status from the jobs table or error. 782 func (j *Job) CurrentStatus(ctx context.Context) (Status, error) { 783 if j.id == nil { 784 return "", errors.New("job has not been created") 785 } 786 var statusString tree.DString 787 if err := j.runInTxn(ctx, func(ctx context.Context, txn *kv.Txn) error { 788 const selectStmt = "SELECT status FROM system.jobs WHERE id = $1" 789 row, err := j.registry.ex.QueryRow(ctx, "job-status", txn, selectStmt, *j.ID()) 790 if err != nil { 791 return errors.Wrapf(err, "job %d: can't query system.jobs", *j.ID()) 792 } 793 if row == nil { 794 return errors.Errorf("job %d: not found in system.jobs", *j.ID()) 795 } 796 797 statusString = tree.MustBeDString(row[0]) 798 return nil 799 }); err != nil { 800 return "", err 801 } 802 return Status(statusString), nil 803 } 804 805 // Start will resume the job. The transaction used to create the StartableJob 806 // must be committed. If a non-nil error is returned, the job was not started 807 // and nothing will be send on errCh. Clients must not start jobs more than 808 // once. 
809 func (sj *StartableJob) Start(ctx context.Context) (errCh <-chan error, err error) { 810 if starts := atomic.AddInt64(&sj.starts, 1); starts != 1 { 811 return nil, errors.AssertionFailedf( 812 "StartableJob %d cannot be started more than once", *sj.ID()) 813 } 814 defer func() { 815 if err != nil { 816 sj.registry.unregister(*sj.ID()) 817 } 818 }() 819 if !sj.txn.IsCommitted() { 820 return nil, fmt.Errorf("cannot resume %T job which is not committed", sj.resumer) 821 } 822 if err := sj.started(ctx); err != nil { 823 return nil, err 824 } 825 errCh, err = sj.registry.resume(sj.resumerCtx, sj.resumer, sj.resultsCh, sj.Job) 826 if err != nil { 827 return nil, err 828 } 829 return errCh, nil 830 } 831 832 // Run will resume the job and wait for it to finish or the context to be 833 // canceled. The transaction used to create the StartableJob must be committed. 834 // Results will be copied to the channel used to create this StartableJob 835 // even if job is canceled. 836 func (sj *StartableJob) Run(ctx context.Context) error { 837 resultsFromJob := make(chan tree.Datums) 838 resultsCh := sj.resultsCh 839 sj.resultsCh = resultsFromJob 840 errCh, err := sj.Start(ctx) 841 if err != nil { 842 return err 843 } 844 jobCompletedOk := false 845 846 var r tree.Datums // stores a row if we've received one. 847 for { 848 // Alternate between receiving rows and sending them. Nil channels block. 849 var fromJob <-chan tree.Datums 850 var toClient chan<- tree.Datums 851 if r == nil { 852 fromJob = resultsFromJob 853 } else { 854 toClient = resultsCh 855 } 856 var ok bool 857 select { 858 case r, ok = <-fromJob: 859 // If the results channel is closed, set it to nil so that we don't 860 // loop infinitely. We still want to wait for the job to notify us on 861 // errCh. 
862 if !ok { 863 close(resultsCh) 864 resultsCh, resultsFromJob = nil, nil 865 } 866 case toClient <- r: 867 r = nil 868 if jobCompletedOk { 869 return nil 870 } 871 case <-ctx.Done(): 872 // Launch a goroutine to continue consuming results from the job. 873 if resultsFromJob != nil { 874 go sj.registry.stopper.RunWorker(ctx, func(ctx context.Context) { 875 for { 876 select { 877 case <-errCh: 878 return 879 case _, ok := <-resultsFromJob: 880 if !ok { 881 return 882 } 883 } 884 } 885 }) 886 } 887 return ctx.Err() 888 case err := <-errCh: 889 // The job has completed, return its final error. 890 if err == nil && r != nil { 891 // We still have data to send to the client. 892 jobCompletedOk = true 893 continue 894 } 895 return err 896 } 897 } 898 } 899 900 // CleanupOnRollback will unregister the job in the case that the creating 901 // transaction has been rolled back. 902 func (sj *StartableJob) CleanupOnRollback(ctx context.Context) error { 903 if sj.txn.IsCommitted() { 904 return errors.AssertionFailedf( 905 "cannot call CleanupOnRollback for a StartableJob created by a committed transaction") 906 } 907 if !sj.txn.Sender().TxnStatus().IsFinalized() { 908 return errors.AssertionFailedf( 909 "cannot call CleanupOnRollback for a StartableJob with a non-finalized transaction") 910 } 911 sj.registry.unregister(*sj.ID()) 912 return nil 913 } 914 915 // Cancel will mark the job as canceled and release its resources in the 916 // Registry. 917 func (sj *StartableJob) Cancel(ctx context.Context) error { 918 defer sj.registry.unregister(*sj.ID()) 919 return sj.registry.CancelRequested(ctx, nil, *sj.ID()) 920 }