github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/jobs/registry.go 1 // Copyright 2017 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package jobs 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "math/rand" 18 "os" 19 "strings" 20 "time" 21 22 "github.com/cockroachdb/cockroach/pkg/base" 23 "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" 24 "github.com/cockroachdb/cockroach/pkg/keys" 25 "github.com/cockroachdb/cockroach/pkg/kv" 26 "github.com/cockroachdb/cockroach/pkg/roachpb" 27 "github.com/cockroachdb/cockroach/pkg/security" 28 "github.com/cockroachdb/cockroach/pkg/settings" 29 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 30 "github.com/cockroachdb/cockroach/pkg/sql/sem/builtins" 31 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" 32 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 33 "github.com/cockroachdb/cockroach/pkg/sql/sqlutil" 34 "github.com/cockroachdb/cockroach/pkg/sql/types" 35 "github.com/cockroachdb/cockroach/pkg/util/envutil" 36 "github.com/cockroachdb/cockroach/pkg/util/hlc" 37 "github.com/cockroachdb/cockroach/pkg/util/log" 38 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 39 "github.com/cockroachdb/cockroach/pkg/util/retry" 40 "github.com/cockroachdb/cockroach/pkg/util/stop" 41 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 42 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 43 "github.com/cockroachdb/errors" 44 "github.com/cockroachdb/logtags" 45 opentracing "github.com/opentracing/opentracing-go" 46 ) 47 48 const defaultLeniencySetting = 60 * time.Second 49 50 // See https://github.com/cockroachdb/cockroach/issues/47892. 51 const multiTenancyIssueNo = 47892 52 53 var ( 54 nodeLivenessLogLimiter = log.Every(5 * time.Second) 55 // LeniencySetting is the amount of time to defer any attempts to 56 // reschedule a job. Visible for testing. 57 LeniencySetting = settings.RegisterDurationSetting( 58 "jobs.registry.leniency", 59 "the amount of time to defer any attempts to reschedule a job", 60 defaultLeniencySetting) 61 gcSetting = settings.RegisterDurationSetting( 62 "jobs.retention_time", 63 "the amount of time to retain records for completed jobs before deleting them", 64 time.Hour*24*14) 65 ) 66 67 // Registry creates Jobs and manages their leases and cancelation. 68 // 69 // Job information is stored in the `system.jobs` table. Each node will 70 // poll this table and establish a lease on any claimed job. Registry 71 // calculates its own liveness for a node based on the expiration time 72 // of the underlying node-liveness lease. This is because we want to 73 // allow jobs assigned to temporarily non-live (i.e. saturated) nodes to 74 // continue without being canceled. 75 // 76 // When a lease has been determined to be stale, a node may attempt to 77 // claim the relevant job. Thus, a Registry must occasionally 78 // re-validate its own leases to ensure that another node has not stolen 79 // the work and cancel the local job if so. 80 // 81 // Prior versions of Registry used the node's epoch value to determine 82 // whether or not a job should be stolen.
The current implementation 83 // uses a time-based approach, where a node's last reported expiration 84 // timestamp is used to calculate a liveness value for the purpose 85 // of job scheduling. 86 // 87 // Mixed-version operation between epoch- and time-based nodes works 88 // since we still publish epoch information in the leases for time-based 89 // nodes. From the perspective of a time-based node, an epoch-based 90 // node simply behaves as though its leniency period is 0. Epoch-based 91 // nodes will see time-based nodes delay the act of stealing a job. 92 type Registry struct { 93 ac log.AmbientContext 94 stopper *stop.Stopper 95 nl sqlbase.OptionalNodeLiveness 96 db *kv.DB 97 ex sqlutil.InternalExecutor 98 clock *hlc.Clock 99 nodeID *base.SQLIDContainer 100 settings *cluster.Settings 101 planFn planHookMaker 102 metrics Metrics 103 adoptionCh chan struct{} 104 105 // sessionBoundInternalExecutorFactory provides a way for jobs to create 106 // internal executors. This is rarely needed, and usually job resumers should 107 // use the internal executor from the PlanHookState. The intended user of this 108 // interface is the schema change job resumer, which needs to set the 109 // tableCollectionModifier on the internal executor to different values in 110 // multiple concurrent queries. This situation is an exception to the internal 111 // executor generally being a stateless wrapper, and makes it impossible to 112 // reuse the same internal executor across all the queries (without 113 // refactoring to get rid of the tableCollectionModifier field, which we 114 // should do eventually). 115 // 116 // Note that, while this API is not ideal, internal executors are basically 117 // lightweight wrappers requiring no additional teardown. There's not much 118 // cost incurred in creating these. 119 // 120 // TODO (lucy): We should refactor and get rid of the tableCollectionModifier 121 // field. Modifying the TableCollection is basically a per-query operation 122 // and should be a per-query setting. #34304 is the issue for creating/ 123 // improving this API. 124 sessionBoundInternalExecutorFactory sqlutil.SessionBoundInternalExecutorFactory 125 126 // if non-empty, indicates path to file that prevents any job adoptions. 127 preventAdoptionFile string 128 129 mu struct { 130 syncutil.Mutex 131 // epoch is present to support older nodes that are not using 132 // the timestamp-based approach to determine when to steal jobs. 133 // TODO: Remove this and deprecate Lease.Epoch proto field 134 epoch int64 135 // jobs holds a map from job id to its context cancel func. This should 136 // be populated with jobs that are currently being run (and owned) by 137 // this registry. Calling the func will cancel the context the job was 138 // started/resumed with. This should only be called by the registry when 139 // it is attempting to halt its own jobs due to liveness problems. Jobs 140 // are normally canceled on any node by the CANCEL JOB statement, which is 141 // propagated to jobs via the .Progressed call. This function should not be 142 // used to cancel a job in that way. 143 jobs map[int64]context.CancelFunc 144 } 145 146 TestingResumerCreationKnobs map[jobspb.Type]func(Resumer) Resumer 147 } 148 149 // planHookMaker is a wrapper around sql.NewInternalPlanner. It returns an 150 // *sql.planner as an interface{} due to package dependency cycles. It should 151 // be cast to that type in the sql package when it is used. 
Returns a cleanup 152 // function that must be called once the caller is done with the planner. 153 // 154 // TODO(mjibson): Can we do something to avoid passing an interface{} here 155 // that must be type casted in a Resumer? It cannot be done here because 156 // PlanHookState lives in the sql package, which would create a dependency 157 // cycle if listed here. Furthermore, moving PlanHookState into a common 158 // subpackage like sqlbase is difficult because of the amount of sql-only 159 // stuff that PlanHookState exports. One other choice is to merge this package 160 // back into the sql package. There's maybe a better way that I'm unaware of. 161 type planHookMaker func(opName, user string) (interface{}, func()) 162 163 // PreventAdoptionFile is the name of the file which, if present in the first 164 // on-disk store, will prevent the adoption of background jobs by that node. 165 const PreventAdoptionFile = "DISABLE_STARTING_BACKGROUND_JOBS" 166 167 // MakeRegistry creates a new Registry. planFn is a wrapper around 168 // sql.newInternalPlanner. It returns a sql.PlanHookState, but must be 169 // coerced into that in the Resumer functions. 170 func MakeRegistry( 171 ac log.AmbientContext, 172 stopper *stop.Stopper, 173 clock *hlc.Clock, 174 nl sqlbase.OptionalNodeLiveness, 175 db *kv.DB, 176 ex sqlutil.InternalExecutor, 177 nodeID *base.SQLIDContainer, 178 settings *cluster.Settings, 179 histogramWindowInterval time.Duration, 180 planFn planHookMaker, 181 preventAdoptionFile string, 182 ) *Registry { 183 r := &Registry{ 184 ac: ac, 185 stopper: stopper, 186 clock: clock, 187 nl: nl, 188 db: db, 189 ex: ex, 190 nodeID: nodeID, 191 settings: settings, 192 planFn: planFn, 193 preventAdoptionFile: preventAdoptionFile, 194 adoptionCh: make(chan struct{}), 195 } 196 r.mu.epoch = 1 197 r.mu.jobs = make(map[int64]context.CancelFunc) 198 r.metrics.InitHooks(histogramWindowInterval) 199 return r 200 } 201 202 // SetSessionBoundInternalExecutorFactory sets the 203 // SessionBoundInternalExecutorFactory that will be used by the job registry 204 // executor. We expose this separately from the constructor to avoid a circular 205 // dependency. 206 func (r *Registry) SetSessionBoundInternalExecutorFactory( 207 factory sqlutil.SessionBoundInternalExecutorFactory, 208 ) { 209 r.sessionBoundInternalExecutorFactory = factory 210 } 211 212 // MetricsStruct returns the metrics for production monitoring of each job type. 213 // They're all stored as the `metric.Struct` interface because of dependency 214 // cycles. 215 func (r *Registry) MetricsStruct() *Metrics { 216 return &r.metrics 217 } 218 219 // CurrentlyRunningJobs returns a slice of the ids of all jobs running on this node. 220 func (r *Registry) CurrentlyRunningJobs() []int64 { 221 r.mu.Lock() 222 defer r.mu.Unlock() 223 jobs := make([]int64, len(r.mu.jobs)) 224 i := 0 225 for jID := range r.mu.jobs { 226 jobs[i] = jID 227 i++ 228 } 229 return jobs 230 } 231 232 // lenientNow returns the timestamp after which we should attempt 233 // to steal a job from a node whose liveness is failing. This allows 234 // jobs coordinated by a node which is temporarily saturated to continue. 235 func (r *Registry) lenientNow() time.Time { 236 // We see this in tests. 
237 var offset time.Duration 238 if r.settings == cluster.NoSettings { 239 offset = defaultLeniencySetting 240 } else { 241 offset = LeniencySetting.Get(&r.settings.SV) 242 } 243 244 return r.clock.Now().GoTime().Add(-offset) 245 } 246 247 // makeCtx returns a new context from r's ambient context and an associated 248 // cancel func. 249 func (r *Registry) makeCtx() (context.Context, func()) { 250 return context.WithCancel(r.ac.AnnotateCtx(context.Background())) 251 } 252 253 func (r *Registry) makeJobID() int64 { 254 return int64(builtins.GenerateUniqueInt(r.nodeID.SQLInstanceID())) 255 } 256 257 // CreateAndStartJob creates and asynchronously starts a job from record. An 258 // error is returned if the job type has not been registered with 259 // RegisterConstructor. The ctx passed to this function is not the context the 260 // job will be started with (canceling ctx will not cause the job to cancel). 261 func (r *Registry) CreateAndStartJob( 262 ctx context.Context, resultsCh chan<- tree.Datums, record Record, 263 ) (*Job, <-chan error, error) { 264 var rj *StartableJob 265 if err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) (err error) { 266 rj, err = r.CreateStartableJobWithTxn(ctx, record, txn, resultsCh) 267 return err 268 }); err != nil { 269 return nil, nil, err 270 } 271 errCh, err := rj.Start(ctx) 272 if err != nil { 273 return nil, nil, err 274 } 275 return rj.Job, errCh, nil 276 } 277 278 // Run starts previously unstarted jobs from a list of scheduled 279 // jobs. Canceling ctx interrupts the waiting but doesn't cancel the jobs. 280 func (r *Registry) Run(ctx context.Context, ex sqlutil.InternalExecutor, jobs []int64) error { 281 if len(jobs) == 0 { 282 return nil 283 } 284 log.Infof(ctx, "scheduled jobs %+v", jobs) 285 buf := bytes.Buffer{} 286 for i, id := range jobs { 287 select { 288 case r.adoptionCh <- struct{}{}: 289 case <-ctx.Done(): 290 return ctx.Err() 291 } 292 293 if i > 0 { 294 buf.WriteString(",") 295 } 296 buf.WriteString(fmt.Sprintf(" %d", id)) 297 } 298 // Manually retry instead of using SHOW JOBS WHEN COMPLETE so we have greater 299 // control over retries. Also, avoiding SHOW JOBS prevents us from having to 300 // populate the crdb_internal.jobs vtable. 301 query := fmt.Sprintf( 302 `SELECT count(*) FROM system.jobs WHERE id IN (%s) 303 AND (status != 'succeeded' AND status != 'failed' AND status != 'canceled')`, 304 buf.String()) 305 for r := retry.StartWithCtx(ctx, retry.Options{ 306 InitialBackoff: 10 * time.Millisecond, 307 MaxBackoff: 1 * time.Second, 308 Multiplier: 2, 309 }); r.Next(); { 310 // We poll the number of queued jobs that aren't finished. As with SHOW JOBS 311 // WHEN COMPLETE, if one of the jobs is missing from the jobs table for 312 // whatever reason, we'll fail later when we try to load the job. 
313 row, err := ex.QueryRowEx( 314 ctx, 315 "poll-show-jobs", 316 nil, /* txn */ 317 sqlbase.InternalExecutorSessionDataOverride{User: security.RootUser}, 318 query, 319 ) 320 if err != nil { 321 return errors.Wrap(err, "polling for queued jobs to complete") 322 } 323 count := int64(tree.MustBeDInt(row[0])) 324 if log.V(3) { 325 log.Infof(ctx, "waiting for %d queued jobs to complete", count) 326 } 327 if count == 0 { 328 break 329 } 330 } 331 for i, id := range jobs { 332 j, err := r.LoadJob(ctx, id) 333 if err != nil { 334 return errors.WithHint( 335 errors.Wrapf(err, "job %d could not be loaded", jobs[i]), 336 "The job may not have succeeded.") 337 } 338 if j.Payload().FinalResumeError != nil { 339 decodedErr := errors.DecodeError(ctx, *j.Payload().FinalResumeError) 340 return decodedErr 341 } 342 if j.Payload().Error != "" { 343 return errors.Newf("job %d failed with error: %s", jobs[i], j.Payload().Error) 344 } 345 } 346 return nil 347 } 348 349 // NewJob creates a new Job. 350 func (r *Registry) NewJob(record Record) *Job { 351 job := &Job{ 352 registry: r, 353 } 354 job.mu.payload = jobspb.Payload{ 355 Description: record.Description, 356 Statement: record.Statement, 357 Username: record.Username, 358 DescriptorIDs: record.DescriptorIDs, 359 Details: jobspb.WrapPayloadDetails(record.Details), 360 Noncancelable: record.NonCancelable, 361 } 362 job.mu.progress = jobspb.Progress{ 363 Details: jobspb.WrapProgressDetails(record.Progress), 364 RunningStatus: string(record.RunningStatus), 365 } 366 return job 367 } 368 369 // CreateJobWithTxn creates a job to be started later with StartJob. 370 // It stores the job in the jobs table, marks it pending and gives the 371 // current node a lease. 372 func (r *Registry) CreateJobWithTxn(ctx context.Context, record Record, txn *kv.Txn) (*Job, error) { 373 j := r.NewJob(record) 374 if err := j.WithTxn(txn).insert(ctx, r.makeJobID(), r.newLease()); err != nil { 375 return nil, err 376 } 377 return j, nil 378 } 379 380 // CreateStartableJobWithTxn creates a job to be started later, after the 381 // creating txn commits. The method uses the passed txn to write the job in the 382 // jobs table, marks it pending and gives the current node a lease. It 383 // additionally registers the job with the Registry which will prevent the 384 // Registry from adopting the job after the transaction commits. The resultsCh 385 // will be connected to the output of the job and written to after the returned 386 // StartableJob is started. 387 // 388 // The returned job is not associated with the user transaction. The intention 389 // is that the job will not be modified again in txn. If the transaction is 390 // committed, the caller must explicitly Start it. If the transaction is rolled 391 // back then the caller must call CleanupOnRollback to unregister the job from 392 // the Registry. 393 func (r *Registry) CreateStartableJobWithTxn( 394 ctx context.Context, record Record, txn *kv.Txn, resultsCh chan<- tree.Datums, 395 ) (*StartableJob, error) { 396 j, err := r.CreateJobWithTxn(ctx, record, txn) 397 if err != nil { 398 return nil, err 399 } 400 // The job itself must not hold on to this transaction. We ensure in Start() 401 // that the transaction used to create the job is committed. When jobs hold 402 // onto transactions they use the transaction in methods which modify the job. 403 // On the whole this pattern is bug-prone and hard to reason about. 
404 j.WithTxn(nil) 405 resumer, err := r.createResumer(j, r.settings) 406 if err != nil { 407 return nil, err 408 } 409 resumerCtx, cancel := r.makeCtx() 410 if err := r.register(*j.ID(), cancel); err != nil { 411 return nil, err 412 } 413 return &StartableJob{ 414 Job: j, 415 txn: txn, 416 resumer: resumer, 417 resumerCtx: resumerCtx, 418 cancel: cancel, 419 resultsCh: resultsCh, 420 }, nil 421 } 422 423 // LoadJob loads an existing job with the given jobID from the system.jobs 424 // table. 425 func (r *Registry) LoadJob(ctx context.Context, jobID int64) (*Job, error) { 426 return r.LoadJobWithTxn(ctx, jobID, nil) 427 } 428 429 // LoadJobWithTxn does the same as above, but using the transaction passed in 430 // the txn argument. Passing a nil transaction is equivalent to calling LoadJob 431 // in that a transaction will be automatically created. 432 func (r *Registry) LoadJobWithTxn(ctx context.Context, jobID int64, txn *kv.Txn) (*Job, error) { 433 j := &Job{ 434 id: &jobID, 435 registry: r, 436 } 437 if err := j.WithTxn(txn).load(ctx); err != nil { 438 return nil, err 439 } 440 return j, nil 441 } 442 443 // DefaultCancelInterval is a reasonable interval at which to poll this node 444 // for liveness failures and cancel running jobs. 445 var DefaultCancelInterval = base.DefaultTxnHeartbeatInterval 446 447 // DefaultAdoptInterval is a reasonable interval at which to poll system.jobs 448 // for jobs with expired leases. 449 // 450 // DefaultAdoptInterval is mutable for testing. NB: Updates to this value after 451 // Registry.Start has been called will not have any effect. 452 var DefaultAdoptInterval = 30 * time.Second 453 454 var maxAdoptionsPerLoop = envutil.EnvOrDefaultInt(`COCKROACH_JOB_ADOPTIONS_PER_PERIOD`, 10) 455 456 // gcInterval is how often we check for and delete job records older than the 457 // retention limit. 458 const gcInterval = 1 * time.Hour 459 460 // Start polls the current node for liveness failures and cancels all registered 461 // jobs if it observes a failure. Otherwise it starts all the main daemons of 462 // registry that poll the jobs table and start/cancel/gc jobs. 463 func (r *Registry) Start( 464 ctx context.Context, stopper *stop.Stopper, cancelInterval, adoptInterval time.Duration, 465 ) error { 466 // Calling maybeCancelJobs once at the start ensures we have an up-to-date 467 // liveness epoch before we wait out the first cancelInterval. 
468 r.maybeCancelJobs(ctx, r.nl) 469 470 stopper.RunWorker(context.Background(), func(ctx context.Context) { 471 for { 472 select { 473 case <-stopper.ShouldStop(): 474 return 475 case <-time.After(cancelInterval): 476 r.maybeCancelJobs(ctx, r.nl) 477 } 478 } 479 }) 480 481 stopper.RunWorker(context.Background(), func(ctx context.Context) { 482 for { 483 select { 484 case <-stopper.ShouldStop(): 485 return 486 case <-time.After(gcInterval): 487 old := timeutil.Now().Add(-1 * gcSetting.Get(&r.settings.SV)) 488 if err := r.cleanupOldJobs(ctx, old); err != nil { 489 log.Warningf(ctx, "error cleaning up old job records: %v", err) 490 } 491 } 492 } 493 }) 494 495 maybeAdoptJobs := func(ctx context.Context, randomizeJobOrder bool) { 496 if r.adoptionDisabled(ctx) { 497 r.cancelAll(ctx) 498 return 499 } 500 if err := r.maybeAdoptJob(ctx, r.nl, randomizeJobOrder); err != nil { 501 log.Errorf(ctx, "error while adopting jobs: %s", err) 502 } 503 } 504 505 stopper.RunWorker(context.Background(), func(ctx context.Context) { 506 for { 507 select { 508 case <-stopper.ShouldStop(): 509 return 510 case <-r.adoptionCh: 511 // Try to adopt the most recently created job. 512 maybeAdoptJobs(ctx, false /* randomizeJobOrder */) 513 case <-time.After(adoptInterval): 514 maybeAdoptJobs(ctx, true /* randomizeJobOrder */) 515 } 516 } 517 }) 518 return nil 519 } 520 521 func (r *Registry) maybeCancelJobs(ctx context.Context, nlw sqlbase.OptionalNodeLiveness) { 522 // Cancel all jobs if the stopper is quiescing. 523 select { 524 case <-r.stopper.ShouldQuiesce(): 525 r.cancelAll(ctx) 526 return 527 default: 528 } 529 530 nl, ok := nlw.Optional(47892) 531 if !ok { 532 // At most one container is running on behalf of a SQL tenant, so it must be 533 // this one, and there's no point canceling anything. 534 // 535 // TODO(ajwerner): don't rely on this. Instead fix this issue: 536 // https://github.com/cockroachdb/cockroach/issues/47892 537 return 538 } 539 liveness, err := nl.Self() 540 if err != nil { 541 if nodeLivenessLogLimiter.ShouldLog() { 542 log.Warningf(ctx, "unable to get node liveness: %s", err) 543 } 544 // Conservatively assume our lease has expired. Abort all jobs. 545 r.cancelAll(ctx) 546 return 547 } 548 549 // If we haven't persisted a liveness record within the leniency 550 // interval, we'll cancel all of our jobs. 551 if !liveness.IsLive(r.lenientNow()) { 552 r.mu.Lock() 553 defer r.mu.Unlock() 554 r.cancelAllLocked(ctx) 555 r.mu.epoch = liveness.Epoch 556 return 557 } 558 } 559 560 // isOrphaned tries to detect if there are no mutations left to be done for the 561 // job which will make it a candidate for garbage collection. Jobs can be left 562 // in such inconsistent state if they fail before being removed from the jobs table. 
563 func (r *Registry) isOrphaned(ctx context.Context, payload *jobspb.Payload) (bool, error) { 564 if payload.Type() != jobspb.TypeSchemaChange { 565 return false, nil 566 } 567 for _, id := range payload.DescriptorIDs { 568 pendingMutations := false 569 if err := r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error { 570 td, err := sqlbase.GetTableDescFromID(ctx, txn, keys.TODOSQLCodec, id) 571 if err != nil { 572 return err 573 } 574 hasAnyMutations := len(td.GetMutations()) != 0 || len(td.GetGCMutations()) != 0 575 hasDropJob := td.DropJobID != 0 576 pendingMutations = hasAnyMutations || hasDropJob 577 return nil 578 }); err != nil { 579 if errors.Is(err, sqlbase.ErrDescriptorNotFound) { 580 // Treat missing table descriptors as no longer relevant for the 581 // job payload. See 582 // https://github.com/cockroachdb/cockroach/45399. 583 continue 584 } 585 return false, err 586 } 587 if pendingMutations { 588 return false, nil 589 } 590 } 591 return true, nil 592 } 593 594 func (r *Registry) cleanupOldJobs(ctx context.Context, olderThan time.Time) error { 595 const stmt = `SELECT id, payload, status, created FROM system.jobs WHERE created < $1 596 ORDER BY created LIMIT 1000` 597 rows, err := r.ex.Query(ctx, "gc-jobs", nil /* txn */, stmt, olderThan) 598 if err != nil { 599 return err 600 } 601 602 toDelete := tree.NewDArray(types.Int) 603 toDelete.Array = make(tree.Datums, 0, len(rows)) 604 oldMicros := timeutil.ToUnixMicros(olderThan) 605 for _, row := range rows { 606 payload, err := UnmarshalPayload(row[1]) 607 if err != nil { 608 return err 609 } 610 remove := false 611 switch Status(*row[2].(*tree.DString)) { 612 case StatusRunning, StatusPending: 613 done, err := r.isOrphaned(ctx, payload) 614 if err != nil { 615 return err 616 } 617 remove = done && row[3].(*tree.DTimestamp).Time.Before(olderThan) 618 case StatusSucceeded, StatusCanceled, StatusFailed: 619 remove = payload.FinishedMicros < oldMicros 620 } 621 if remove { 622 toDelete.Array = append(toDelete.Array, row[0]) 623 } 624 } 625 if len(toDelete.Array) > 0 { 626 log.Infof(ctx, "cleaning up %d expired job records", len(toDelete.Array)) 627 const stmt = `DELETE FROM system.jobs WHERE id = ANY($1)` 628 var nDeleted int 629 if nDeleted, err = r.ex.Exec( 630 ctx, "gc-jobs", nil /* txn */, stmt, toDelete, 631 ); err != nil { 632 return errors.Wrap(err, "deleting old jobs") 633 } 634 if nDeleted != len(toDelete.Array) { 635 return errors.Errorf("asked to delete %d rows but %d were actually deleted", 636 len(toDelete.Array), nDeleted) 637 } 638 } 639 return nil 640 } 641 642 // getJobFn attempts to get a resumer from the given job id. If the job id 643 // does not have a resumer then it returns an error message suitable for users. 644 func (r *Registry) getJobFn(ctx context.Context, txn *kv.Txn, id int64) (*Job, Resumer, error) { 645 job, err := r.LoadJobWithTxn(ctx, id, txn) 646 if err != nil { 647 return nil, nil, err 648 } 649 resumer, err := r.createResumer(job, r.settings) 650 if err != nil { 651 return job, nil, errors.Errorf("job %d is not controllable", id) 652 } 653 return job, resumer, nil 654 } 655 656 // CancelRequested marks the job as cancel-requested using the specified txn (may be nil). 657 func (r *Registry) CancelRequested(ctx context.Context, txn *kv.Txn, id int64) error { 658 job, _, err := r.getJobFn(ctx, txn, id) 659 if err != nil { 660 // Special case schema change jobs to mark the job as canceled. 
661 if job != nil { 662 payload := job.Payload() 663 // TODO(mjibson): Use an unfortunate workaround to enable canceling of 664 // schema change jobs by comparing the string description. When a schema 665 // change job fails or is canceled, a new job is created with the ROLL BACK 666 // prefix. These rollback jobs cannot be canceled. We could add a field to 667 // the payload proto to indicate if this job is cancelable or not, but in 668 // a split version cluster an older node could pick up the schema change 669 // and fail to clear/set that field appropriately. Thus it seems that the 670 // safest way for now (i.e., without a larger jobs/schema change refactor) 671 // is to hack this up with a string comparison. 672 if payload.Type() == jobspb.TypeSchemaChange && !strings.HasPrefix(payload.Description, "ROLL BACK") { 673 return job.WithTxn(txn).cancelRequested(ctx, nil) 674 } 675 } 676 return err 677 } 678 return job.WithTxn(txn).cancelRequested(ctx, nil) 679 } 680 681 // PauseRequested marks the job with id as paused-requested using the specified txn (may be nil). 682 func (r *Registry) PauseRequested(ctx context.Context, txn *kv.Txn, id int64) error { 683 job, resumer, err := r.getJobFn(ctx, txn, id) 684 if err != nil { 685 return err 686 } 687 var onPauseRequested onPauseRequestFunc 688 if pr, ok := resumer.(PauseRequester); ok { 689 onPauseRequested = pr.OnPauseRequest 690 } 691 return job.WithTxn(txn).pauseRequested(ctx, onPauseRequested) 692 } 693 694 // Succeeded marks the job with id as succeeded. 695 func (r *Registry) Succeeded(ctx context.Context, txn *kv.Txn, id int64) error { 696 job, _, err := r.getJobFn(ctx, txn, id) 697 if err != nil { 698 return err 699 } 700 return job.WithTxn(txn).succeeded(ctx, nil) 701 } 702 703 // Failed marks the job with id as failed. 704 func (r *Registry) Failed(ctx context.Context, txn *kv.Txn, id int64, causingError error) error { 705 job, _, err := r.getJobFn(ctx, txn, id) 706 if err != nil { 707 return err 708 } 709 return job.WithTxn(txn).failed(ctx, causingError, nil) 710 } 711 712 // Resume resumes the paused job with id using the specified txn (may be nil). 713 func (r *Registry) Resume(ctx context.Context, txn *kv.Txn, id int64) error { 714 job, _, err := r.getJobFn(ctx, txn, id) 715 if err != nil { 716 return err 717 } 718 return job.WithTxn(txn).resumed(ctx) 719 } 720 721 // Resumer is a resumable job, and is associated with a Job object. Jobs can be 722 // paused or canceled at any time. Jobs should call their CheckStatus() or 723 // Progressed() method, which will return an error if the job has been paused or 724 // canceled. 725 // 726 // Resumers are created through registered Constructor functions. 727 // 728 type Resumer interface { 729 // Resume is called when a job is started or resumed. Sending results on the 730 // chan will return them to a user, if a user's session is connected. phs 731 // is a sql.PlanHookState. 732 Resume(ctx context.Context, phs interface{}, resultsCh chan<- tree.Datums) error 733 734 // OnFailOrCancel is called when a job fails or is cancel-requested. 735 // 736 // This method will be called when a registry notices the cancel request, 737 // which is not guaranteed to run on the node where the job is running. So it 738 // cannot assume that any other methods have been called on this Resumer 739 // object. 
740 OnFailOrCancel(ctx context.Context, phs interface{}) error 741 } 742 743 // PauseRequester is an extension of Resumer which allows job implementers to inject 744 // logic during the transaction which moves a job to PauseRequested. 745 type PauseRequester interface { 746 Resumer 747 748 // OnPauseRequest is called in the transaction that moves a job to PauseRequested. 749 // If an error is returned, the pause request will fail. phs is a 750 // sql.PlanHookState. 751 OnPauseRequest(ctx context.Context, phs interface{}, txn *kv.Txn, details *jobspb.Progress) error 752 } 753 754 // Constructor creates a resumable job of a certain type. The Resumer is 755 // created on the coordinator each time the job is started/resumed, so it can 756 // hold state. The Resume method is always run, and can set state on the Resumer 757 // that can be used by the other methods. 758 type Constructor func(job *Job, settings *cluster.Settings) Resumer 759 760 var constructors = make(map[jobspb.Type]Constructor) 761 762 // RegisterConstructor registers a Resumer constructor for a certain job type. 763 func RegisterConstructor(typ jobspb.Type, fn Constructor) { 764 constructors[typ] = fn 765 } 766 767 func (r *Registry) createResumer(job *Job, settings *cluster.Settings) (Resumer, error) { 768 payload := job.Payload() 769 fn := constructors[payload.Type()] 770 if fn == nil { 771 return nil, errors.Errorf("no resumer is available for %s", payload.Type()) 772 } 773 if wrapper := r.TestingResumerCreationKnobs[payload.Type()]; wrapper != nil { 774 return wrapper(fn(job, settings)), nil 775 } 776 return fn(job, settings), nil 777 } 778 779 type retryJobError string 780 781 // retryJobErrorSentinel exists so the errors returned from NewRetryJobError can 782 // be marked with it, allowing more robust detection of retry errors even if 783 // they are wrapped, etc. This was originally introduced to deal with injected 784 // retry errors from testing knobs. 785 var retryJobErrorSentinel = retryJobError("") 786 787 // NewRetryJobError creates a new error that, if returned by a Resumer, 788 // indicates to the jobs registry that the job should be restarted in the 789 // background. 790 func NewRetryJobError(s string) error { 791 return errors.Mark(retryJobError(s), retryJobErrorSentinel) 792 } 793 794 func (r retryJobError) Error() string { 795 return string(r) 796 } 797 798 // stepThroughStateMachine implements the state machine of the job lifecycle. 799 // The job is executed with the ctx, so ctx must only be canceled if the job 800 // should also be canceled. resultsCh is passed to the resumable func and should 801 // be closed by the caller after errCh sends a value. errCh returns an error if 802 // the job was not completed with success. status is the current job status.
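// In short, the transitions implemented below are: a running job that returns
// a nil error moves to succeeded; a retry error or a canceled context leaves
// the job to be restarted in the background by a later adoption; any other
// error moves the job to reverting. A reverting job runs OnFailOrCancel and,
// if that succeeds, ends up failed (or canceled, when the original error was
// a cancelation); if reverting itself fails, manual cleanup may be required.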
803 func (r *Registry) stepThroughStateMachine( 804 ctx context.Context, 805 phs interface{}, 806 resumer Resumer, 807 resultsCh chan<- tree.Datums, 808 job *Job, 809 status Status, 810 jobErr error, 811 ) error { 812 log.Infof(ctx, "job %d: stepping through state %s with error: %v", *job.ID(), status, jobErr) 813 switch status { 814 case StatusRunning: 815 if jobErr != nil { 816 return errors.NewAssertionErrorWithWrappedErrf(jobErr, 817 "job %d: resuming with non-nil error", *job.ID()) 818 } 819 resumeCtx := logtags.AddTag(ctx, "job", *job.ID()) 820 err := resumer.Resume(resumeCtx, phs, resultsCh) 821 if err == nil { 822 return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, StatusSucceeded, nil) 823 } 824 if resumeCtx.Err() != nil { 825 // The context was canceled. Tell the user, but don't attempt to 826 // mark the job as failed because it can be resumed by another node. 827 // 828 // TODO(ajwerner): We'll also end up here if the job was canceled or 829 // paused. We should make this error clearer. 830 return errors.Errorf("job %d: node liveness error: restarting in background", *job.ID()) 831 } 832 // TODO(spaskob): enforce a limit on retries. 833 // TODO(spaskob,lucy): Add metrics on job retries. Consider having a backoff 834 // mechanism (possibly combined with a retry limit). 835 if errors.Is(err, retryJobErrorSentinel) { 836 return errors.Errorf("job %d: %s: restarting in background", *job.ID(), err) 837 } 838 if sErr := (*InvalidStatusError)(nil); errors.As(err, &sErr) { 839 if sErr.status != StatusCancelRequested && sErr.status != StatusPauseRequested { 840 return errors.NewAssertionErrorWithWrappedErrf(jobErr, 841 "job %d: unexpected status %s provided for a running job", *job.ID(), sErr.status) 842 } 843 return sErr 844 } 845 return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, StatusReverting, err) 846 case StatusPauseRequested: 847 return errors.Errorf("job %s", status) 848 case StatusCancelRequested: 849 return errors.Errorf("job %s", status) 850 case StatusPaused: 851 return errors.NewAssertionErrorWithWrappedErrf(jobErr, 852 "job %d: unexpected status %s provided to state machine", *job.ID(), status) 853 case StatusCanceled: 854 if err := job.canceled(ctx, nil); err != nil { 855 // If we can't transactionally mark the job as canceled then it will be 856 // restarted during the next adopt loop and reverting will be retried. 857 return errors.Wrapf(err, "job %d: could not mark as canceled: %v", *job.ID(), jobErr) 858 } 859 return errors.WithSecondaryError(errors.Errorf("job %s", status), jobErr) 860 case StatusSucceeded: 861 if jobErr != nil { 862 return errors.NewAssertionErrorWithWrappedErrf(jobErr, 863 "job %d: successful but unexpected error provided", *job.ID()) 864 } 865 if err := job.succeeded(ctx, nil); err != nil { 866 // If it didn't succeed, we consider the job as failed and need to go 867 // through reverting state first. 868 // TODO(spaskob): this is silly, we should remove the OnSuccess hooks and 869 // execute them in resume so that the client can handle these errors 870 // better. 871 return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, StatusReverting, errors.Wrapf(err, "could not mark job %d as succeeded", *job.ID())) 872 } 873 return nil 874 case StatusReverting: 875 if err := job.reverted(ctx, jobErr, nil); err != nil { 876 // If we can't transactionally mark the job as reverting then it will be 877 // restarted during the next adopt loop and it will be retried.
878 return errors.Wrapf(err, "job %d: could not mark as reverting: %s", *job.ID(), jobErr) 879 } 880 onFailOrCancelCtx := logtags.AddTag(ctx, "job", *job.ID()) 881 err := resumer.OnFailOrCancel(onFailOrCancelCtx, phs) 882 if successOnFailOrCancel := err == nil; successOnFailOrCancel { 883 // If the job has failed with any error different than canceled we 884 // mark it as Failed. 885 nextStatus := StatusFailed 886 if errors.Is(jobErr, errJobCanceled) { 887 nextStatus = StatusCanceled 888 } 889 return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, nextStatus, jobErr) 890 } 891 if onFailOrCancelCtx.Err() != nil { 892 // The context was canceled. Tell the user, but don't attempt to 893 // mark the job as failed because it can be resumed by another node. 894 return errors.Errorf("job %d: node liveness error: restarting in background", *job.ID()) 895 } 896 if errors.Is(err, retryJobErrorSentinel) { 897 return errors.Errorf("job %d: %s: restarting in background", *job.ID(), err) 898 } 899 if sErr := (*InvalidStatusError)(nil); errors.As(err, &sErr) { 900 if sErr.status != StatusPauseRequested { 901 return errors.NewAssertionErrorWithWrappedErrf(jobErr, 902 "job %d: unexpected status %s provided for a reverting job", *job.ID(), sErr.status) 903 } 904 return sErr 905 } 906 return r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, StatusFailed, errors.Wrapf(err, "job %d: cannot be reverted, manual cleanup may be required", *job.ID())) 907 case StatusFailed: 908 if jobErr == nil { 909 return errors.NewAssertionErrorWithWrappedErrf(jobErr, 910 "job %d: has StatusFailed but no error was provided", *job.ID()) 911 } 912 if err := job.failed(ctx, jobErr, nil); err != nil { 913 // If we can't transactionally mark the job as failed then it will be 914 // restarted during the next adopt loop and reverting will be retried. 915 return errors.Wrapf(err, "job %d: could not mark as failed: %s", *job.ID(), jobErr) 916 } 917 return jobErr 918 default: 919 return errors.NewAssertionErrorWithWrappedErrf(jobErr, 920 "job %d: has unsupported status %s", *job.ID(), status) 921 } 922 } 923 924 // resume starts or resumes a job. If no error is returned then the job was 925 // asynchronously executed. The job is executed with the ctx, so ctx must 926 // only be canceled if the job should also be canceled. resultsCh is passed 927 // to the resumable func and should be closed by the caller after errCh sends 928 // a value. 929 func (r *Registry) resume( 930 ctx context.Context, resumer Resumer, resultsCh chan<- tree.Datums, job *Job, 931 ) (<-chan error, error) { 932 errCh := make(chan error, 1) 933 taskName := fmt.Sprintf(`job-%d`, *job.ID()) 934 if err := r.stopper.RunAsyncTask(ctx, taskName, func(ctx context.Context) { 935 // Bookkeeping. 936 payload := job.Payload() 937 phs, cleanup := r.planFn("resume-"+taskName, payload.Username) 938 defer cleanup() 939 spanName := fmt.Sprintf(`%s-%d`, payload.Type(), *job.ID()) 940 var span opentracing.Span 941 ctx, span = r.ac.AnnotateCtxWithSpan(ctx, spanName) 942 defer span.Finish() 943 944 // Run the actual job.
945 status, err := job.CurrentStatus(ctx) 946 if err == nil { 947 var finalResumeError error 948 if job.Payload().FinalResumeError != nil { 949 finalResumeError = errors.DecodeError(ctx, *job.Payload().FinalResumeError) 950 } 951 err = r.stepThroughStateMachine(ctx, phs, resumer, resultsCh, job, status, finalResumeError) 952 if err != nil { 953 // TODO (lucy): This needs to distinguish between assertion errors in 954 // the job registry and assertion errors in job execution returned from 955 // Resume() or OnFailOrCancel(), and only fail on the former. We have 956 // tests that purposely introduce bad state in order to produce 957 // assertion errors, which shouldn't cause the test to panic. For now, 958 // comment this out. 959 // if errors.HasAssertionFailure(err) { 960 // log.ReportOrPanic(ctx, nil, err.Error()) 961 // } 962 log.Errorf(ctx, "job %d: adoption completed with error %v", *job.ID(), err) 963 } 964 status, err := job.CurrentStatus(ctx) 965 if err != nil { 966 log.Errorf(ctx, "job %d: failed querying status: %v", *job.ID(), err) 967 } else { 968 log.Infof(ctx, "job %d: status %s after adoption finished", *job.ID(), status) 969 } 970 } 971 r.unregister(*job.ID()) 972 errCh <- err 973 }); err != nil { 974 return nil, err 975 } 976 return errCh, nil 977 } 978 979 func (r *Registry) adoptionDisabled(ctx context.Context) bool { 980 if r.preventAdoptionFile != "" { 981 if _, err := os.Stat(r.preventAdoptionFile); err != nil { 982 if !os.IsNotExist(err) { 983 log.Warningf(ctx, "error checking if job adoption is currently disabled: %v", err) 984 } 985 return false 986 } 987 log.Warningf(ctx, "job adoption is currently disabled by existence of %s", r.preventAdoptionFile) 988 return true 989 } 990 return false 991 } 992 993 func (r *Registry) maybeAdoptJob( 994 ctx context.Context, nlw sqlbase.OptionalNodeLiveness, randomizeJobOrder bool, 995 ) error { 996 const stmt = ` 997 SELECT id, payload, progress IS NULL, status 998 FROM system.jobs 999 WHERE status IN ($1, $2, $3, $4, $5) ORDER BY created DESC` 1000 rows, err := r.ex.Query( 1001 ctx, "adopt-job", nil /* txn */, stmt, 1002 StatusPending, StatusRunning, StatusCancelRequested, StatusPauseRequested, StatusReverting, 1003 ) 1004 if err != nil { 1005 return errors.Wrap(err, "failed querying for jobs") 1006 } 1007 1008 if randomizeJobOrder { 1009 rand.Seed(timeutil.Now().UnixNano()) 1010 rand.Shuffle(len(rows), func(i, j int) { rows[i], rows[j] = rows[j], rows[i] }) 1011 } 1012 1013 type nodeStatus struct { 1014 isLive bool 1015 } 1016 nodeStatusMap := map[roachpb.NodeID]*nodeStatus{ 1017 // 0 is not a valid node ID, but we treat it as an always-dead node so that 1018 // the empty lease (Lease{}) is always considered expired. 1019 0: {isLive: false}, 1020 } 1021 // If no liveness is available, adopt all jobs. This is reasonable because this 1022 // only affects SQL tenants, which have at most one SQL server running on their 1023 // behalf at any given time. 1024 if nl, ok := nlw.Optional(47892); ok { 1025 // We subtract the leniency interval here to artificially 1026 // widen the range of times over which the job registry will 1027 // consider the node to be alive. We rely on the fact that 1028 // only a live node updates its own expiration. Thus, the 1029 // expiration time can be used as a reasonable measure of 1030 // when the node was last seen. 
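// For example, with the default 60-second leniency, a node whose liveness
// record expired 30 seconds ago is still treated as live here, so its jobs
// are not yet candidates for adoption by other nodes.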
1031 now := r.lenientNow() 1032 for _, liveness := range nl.GetLivenesses() { 1033 nodeStatusMap[liveness.NodeID] = &nodeStatus{ 1034 isLive: liveness.IsLive(now), 1035 } 1036 1037 // Don't try to start any more jobs unless we're really live, 1038 // otherwise we'd just immediately cancel them. 1039 if liveness.NodeID == r.nodeID.DeprecatedNodeID(multiTenancyIssueNo) { 1040 if !liveness.IsLive(r.clock.Now().GoTime()) { 1041 return errors.Errorf( 1042 "trying to adopt jobs on node %d which is not live", liveness.NodeID) 1043 } 1044 } 1045 } 1046 } 1047 1048 if log.V(3) { 1049 log.Infof(ctx, "evaluating %d jobs for adoption", len(rows)) 1050 } 1051 1052 var adopted int 1053 for _, row := range rows { 1054 if adopted >= maxAdoptionsPerLoop { 1055 // Leave excess jobs for other nodes to get their fair share. 1056 break 1057 } 1058 1059 id := (*int64)(row[0].(*tree.DInt)) 1060 1061 payload, err := UnmarshalPayload(row[1]) 1062 if err != nil { 1063 return err 1064 } 1065 1066 status := Status(tree.MustBeDString(row[3])) 1067 if log.V(3) { 1068 log.Infof(ctx, "job %d: evaluating for adoption with status `%s` and lease %v", 1069 *id, status, payload.Lease) 1070 } 1071 1072 // In version 20.1, the registry must not adopt 19.2-style schema change 1073 // jobs until they've undergone a migration. 1074 // TODO (lucy): Remove this in 20.2. 1075 if isOldSchemaChangeJob(payload) { 1076 log.VEventf(ctx, 2, "job %d: skipping adoption because schema change job has not been migrated", id) 1077 continue 1078 } 1079 1080 if payload.Lease == nil { 1081 // If the lease is missing, it simply means the job does not yet support 1082 // resumability. 1083 if log.V(2) { 1084 log.Infof(ctx, "job %d: skipping: nil lease", *id) 1085 } 1086 continue 1087 } 1088 1089 // If the job has no progress it is from a 2.0 cluster. If the entire cluster 1090 // has been upgraded to 2.1 then we know nothing is running the job and it 1091 // can be safely failed. 1092 if nullProgress, ok := row[2].(*tree.DBool); ok && bool(*nullProgress) { 1093 log.Warningf(ctx, "job %d predates cluster upgrade and must be re-run", *id) 1094 versionErr := errors.New("job predates cluster upgrade and must be re-run") 1095 payload.Error = versionErr.Error() 1096 payloadBytes, err := protoutil.Marshal(payload) 1097 if err != nil { 1098 return err 1099 } 1100 1101 // We can't use job.update here because it fails while attempting to unmarshal 1102 // the progress. Setting the status to failed is idempotent so we don't care 1103 // if multiple nodes execute this. 1104 const updateStmt = `UPDATE system.jobs SET status = $1, payload = $2 WHERE id = $3` 1105 updateArgs := []interface{}{StatusFailed, payloadBytes, *id} 1106 err = r.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error { 1107 _, err := r.ex.Exec(ctx, "job-update", txn, updateStmt, updateArgs...) 1108 return err 1109 }) 1110 if err != nil { 1111 log.Warningf(ctx, "job %d: has no progress but unable to mark failed: %s", *id, err) 1112 } 1113 continue 1114 } 1115 1116 r.mu.Lock() 1117 _, runningOnNode := r.mu.jobs[*id] 1118 r.mu.Unlock() 1119 1120 // If we're running as a tenant (!ok), then we are the sole SQL server in 1121 // charge of its jobs and ought to adopt all of them. Otherwise, look more 1122 // closely at who is running the job and whether to adopt. 1123 if nodeID, ok := r.nodeID.OptionalNodeID(); ok && nodeID != payload.Lease.NodeID { 1124 // Another node holds the lease on the job, see if we should steal it. 
1125 if runningOnNode { 1126 // If we are currently running a job that another node has the lease on, 1127 // stop running it. 1128 log.Warningf(ctx, "job %d: node %d owns lease; canceling", *id, payload.Lease.NodeID) 1129 r.unregister(*id) 1130 continue 1131 } 1132 nodeStatus, ok := nodeStatusMap[payload.Lease.NodeID] 1133 if !ok { 1134 // This case should never happen. 1135 log.ReportOrPanic(ctx, nil, "job %d: skipping: no liveness record for the job's node %d", 1136 log.Safe(*id), payload.Lease.NodeID) 1137 continue 1138 } 1139 if nodeStatus.isLive { 1140 if log.V(2) { 1141 log.Infof(ctx, "job %d: skipping: another node is live and holds the lease", *id) 1142 } 1143 continue 1144 } 1145 } 1146 1147 // Below we know that this node holds the lease on the job, or that we want 1148 // to adopt it anyway because the leaseholder seems dead. 1149 job := &Job{id: id, registry: r} 1150 resumeCtx, cancel := r.makeCtx() 1151 1152 if pauseRequested := status == StatusPauseRequested; pauseRequested { 1153 if err := job.paused(ctx, func(context.Context, *kv.Txn) error { 1154 r.unregister(*id) 1155 return nil 1156 }); err != nil { 1157 log.Errorf(ctx, "job %d: could not set to paused: %v", *id, err) 1158 continue 1159 } 1160 log.Infof(ctx, "job %d: paused", *id) 1161 continue 1162 } 1163 1164 if cancelRequested := status == StatusCancelRequested; cancelRequested { 1165 if err := job.reverted(ctx, errJobCanceled, func(context.Context, *kv.Txn) error { 1166 // Unregister the job in case it is running on the node. 1167 // Unregister is a no-op for jobs that are not running. 1168 r.unregister(*id) 1169 return nil 1170 }); err != nil { 1171 log.Errorf(ctx, "job %d: could not set to reverting: %v", *id, err) 1172 continue 1173 } 1174 log.Infof(ctx, "job %d: canceled: the job is now reverting", *id) 1175 } else if currentlyRunning := r.register(*id, cancel) != nil; currentlyRunning { 1176 if log.V(3) { 1177 log.Infof(ctx, "job %d: skipping: the job is already running/reverting on this node", *id) 1178 } 1179 continue 1180 } 1181 1182 // Check if job status has changed in the meanwhile. 1183 currentStatus, err := job.CurrentStatus(ctx) 1184 if err != nil { 1185 return err 1186 } 1187 if status != currentStatus { 1188 continue 1189 } 1190 // Adopt job and resume/revert it. 1191 if err := job.adopt(ctx, payload.Lease); err != nil { 1192 r.unregister(*id) 1193 return errors.Wrap(err, "unable to acquire lease") 1194 } 1195 1196 resultsCh := make(chan tree.Datums) 1197 resumer, err := r.createResumer(job, r.settings) 1198 if err != nil { 1199 r.unregister(*id) 1200 return err 1201 } 1202 log.Infof(ctx, "job %d: resuming execution", *id) 1203 errCh, err := r.resume(resumeCtx, resumer, resultsCh, job) 1204 if err != nil { 1205 r.unregister(*id) 1206 return err 1207 } 1208 go func() { 1209 // Drain and ignore results. 1210 for range resultsCh { 1211 } 1212 }() 1213 go func() { 1214 // Wait for the job to finish. No need to print the error because if there 1215 // was one it's been set in the job status already. 
1216 <-errCh 1217 close(resultsCh) 1218 }() 1219 1220 adopted++ 1221 } 1222 1223 return nil 1224 } 1225 1226 func (r *Registry) newLease() *jobspb.Lease { 1227 nodeID := r.nodeID.DeprecatedNodeID(multiTenancyIssueNo) 1228 if nodeID == 0 { 1229 panic("jobs.Registry has empty node ID") 1230 } 1231 r.mu.Lock() 1232 defer r.mu.Unlock() 1233 return &jobspb.Lease{NodeID: nodeID, Epoch: r.mu.epoch} 1234 } 1235 1236 func (r *Registry) cancelAll(ctx context.Context) { 1237 r.mu.Lock() 1238 defer r.mu.Unlock() 1239 r.cancelAllLocked(ctx) 1240 } 1241 1242 func (r *Registry) cancelAllLocked(ctx context.Context) { 1243 r.mu.AssertHeld() 1244 for jobID, cancel := range r.mu.jobs { 1245 log.Warningf(ctx, "job %d: canceling due to liveness failure", jobID) 1246 cancel() 1247 } 1248 r.mu.jobs = make(map[int64]context.CancelFunc) 1249 } 1250 1251 // register registers an about to be resumed job in memory so that it can be 1252 // killed and that no one else tries to resume it. This essentially works as a 1253 // barrier that only one function can cross and try to resume the job. 1254 func (r *Registry) register(jobID int64, cancel func()) error { 1255 r.mu.Lock() 1256 defer r.mu.Unlock() 1257 // We need to prevent different routines trying to adopt and resume the job. 1258 if _, alreadyRegistered := r.mu.jobs[jobID]; alreadyRegistered { 1259 return errors.Errorf("job %d: already registered", jobID) 1260 } 1261 r.mu.jobs[jobID] = cancel 1262 return nil 1263 } 1264 1265 func (r *Registry) unregister(jobID int64) { 1266 r.mu.Lock() 1267 defer r.mu.Unlock() 1268 cancel, ok := r.mu.jobs[jobID] 1269 // It is possible for a job to be double unregistered. unregister is always 1270 // called at the end of resume. But it can also be called during cancelAll 1271 // and in the adopt loop under certain circumstances. 1272 if ok { 1273 cancel() 1274 delete(r.mu.jobs, jobID) 1275 } 1276 } 1277 1278 // TestingNudgeAdoptionQueue is used by tests to tell the registry that there is 1279 // a job to be adopted. 1280 func (r *Registry) TestingNudgeAdoptionQueue() { 1281 r.adoptionCh <- struct{}{} 1282 }
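A minimal sketch of how a job implementation plugs into this registry: it implements Resumer and registers a Constructor for its jobspb.Type, typically from an init function in its own package. The names below (the examplejob package, exampleResumer) are illustrative and not part of this file; jobspb.TypeImport merely stands in for whatever type the implementation owns, and the real resumers live in the sql and ccl packages.

package examplejob

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
)

// exampleResumer is an illustrative Resumer implementation.
type exampleResumer struct {
	job *jobs.Job
}

// Resume is called when the job is started or adopted. Returning
// jobs.NewRetryJobError asks the registry to restart the job in the
// background instead of moving it to reverting.
func (r *exampleResumer) Resume(
	ctx context.Context, phs interface{}, resultsCh chan<- tree.Datums,
) error {
	// phs is a sql.PlanHookState; it must be type-asserted in the sql package.
	_ = phs
	// ... do the job's work here, checking ctx for cancelation ...
	return nil
}

// OnFailOrCancel reverts whatever Resume did. It is not guaranteed to run on
// the node that ran Resume, so it must not rely on in-memory state.
func (r *exampleResumer) OnFailOrCancel(ctx context.Context, phs interface{}) error {
	return nil
}

func init() {
	jobs.RegisterConstructor(jobspb.TypeImport,
		func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
			return &exampleResumer{job: job}
		})
}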
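A companion sketch of the two creation paths documented above, under the same illustrative package: CreateAndStartJob creates and starts a job in one step, while CreateStartableJobWithTxn ties creation to the caller's transaction, with Start called only after commit and CleanupOnRollback (assumed here to take only a context) called when the transaction does not commit. The Record contents are placeholders, and callers are expected to drain resultsCh while a job runs.

package examplejob

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/security"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
)

// exampleRecord builds a placeholder job record.
func exampleRecord() jobs.Record {
	return jobs.Record{
		Description: "example job",
		Username:    security.RootUser,
		Details:     jobspb.ImportDetails{},
		Progress:    jobspb.ImportProgress{},
	}
}

// runDetached uses the simple path: the registry writes the job record in its
// own transaction and starts it immediately. The returned channel reports the
// job's final error.
func runDetached(
	ctx context.Context, registry *jobs.Registry, resultsCh chan<- tree.Datums,
) error {
	_, errCh, err := registry.CreateAndStartJob(ctx, resultsCh, exampleRecord())
	if err != nil {
		return err
	}
	return <-errCh
}

// runInTxn uses the transactional path: the job row is written inside the
// caller's transaction and the job is only started once that transaction has
// committed.
func runInTxn(
	ctx context.Context, db *kv.DB, registry *jobs.Registry, resultsCh chan<- tree.Datums,
) error {
	var sj *jobs.StartableJob
	if err := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) (err error) {
		sj, err = registry.CreateStartableJobWithTxn(ctx, exampleRecord(), txn, resultsCh)
		return err
	}); err != nil {
		if sj != nil {
			// The transaction did not commit; unregister the job.
			_ = sj.CleanupOnRollback(ctx)
		}
		return err
	}
	errCh, err := sj.Start(ctx)
	if err != nil {
		return err
	}
	return <-errCh
}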