github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/backupccl/backup_job.go

// Copyright 2016 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package backupccl

import (
	"context"
	"fmt"
	"math/rand"
	"time"

	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/covering"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)

// BackupCheckpointInterval is the interval at which backup progress is saved
// to durable storage.
var BackupCheckpointInterval = time.Minute

func (r *RowCount) add(other RowCount) {
	r.DataSize += other.DataSize
	r.Rows += other.Rows
	r.IndexEntries += other.IndexEntries
}

func countRows(raw roachpb.BulkOpSummary, pkIDs map[uint64]struct{}) RowCount {
	res := RowCount{DataSize: raw.DataSize}
	for id, count := range raw.EntryCounts {
		if _, ok := pkIDs[id]; ok {
			res.Rows += count
		} else {
			res.IndexEntries += count
		}
	}
	return res
}

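// allRangeDescriptors returns every range descriptor in the cluster by
// scanning the meta2 index. The descriptors are used below to split backup
// spans at range boundaries.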
func allRangeDescriptors(ctx context.Context, txn *kv.Txn) ([]roachpb.RangeDescriptor, error) {
	rows, err := txn.Scan(ctx, keys.Meta2Prefix, keys.MetaMax, 0)
	if err != nil {
		return nil, errors.Wrapf(err, "unable to scan range descriptors")
	}

	rangeDescs := make([]roachpb.RangeDescriptor, len(rows))
	for i, row := range rows {
		if err := row.ValueProto(&rangeDescs[i]); err != nil {
			return nil, errors.NewAssertionErrorWithWrappedErrf(err,
				"%s: unable to unmarshal range descriptor", row.Key)
		}
	}
	return rangeDescs, nil
}

// coveringFromSpans creates an interval.Covering with a fixed payload from a
// slice of roachpb.Spans.
func coveringFromSpans(spans []roachpb.Span, payload interface{}) covering.Covering {
	var c covering.Covering
	for _, span := range spans {
		c = append(c, covering.Range{
			Start:   []byte(span.Key),
			End:     []byte(span.EndKey),
			Payload: payload,
		})
	}
	return c
}

// splitAndFilterSpans returns the spans that represent the set difference
// (includes - excludes) while also guaranteeing that each output span does not
// cross the endpoint of a RangeDescriptor in ranges.
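//
// For example (hypothetical keys): with includes covering [a, f), excludes
// covering [c, d), and a range boundary at b, the output is [a, b), [b, c),
// and [d, f). Adjacent output spans are not merged back together.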
func splitAndFilterSpans(
	includes []roachpb.Span, excludes []roachpb.Span, ranges []roachpb.RangeDescriptor,
) []roachpb.Span {
	type includeMarker struct{}
	type excludeMarker struct{}

	includeCovering := coveringFromSpans(includes, includeMarker{})
	excludeCovering := coveringFromSpans(excludes, excludeMarker{})

	var rangeCovering covering.Covering
	for _, rangeDesc := range ranges {
		rangeCovering = append(rangeCovering, covering.Range{
			Start: []byte(rangeDesc.StartKey),
			End:   []byte(rangeDesc.EndKey),
		})
	}

	splits := covering.OverlapCoveringMerge(
		[]covering.Covering{includeCovering, excludeCovering, rangeCovering},
	)

	var out []roachpb.Span
	for _, split := range splits {
		include := false
		exclude := false
		for _, payload := range split.Payload.([]interface{}) {
			switch payload.(type) {
			case includeMarker:
				include = true
			case excludeMarker:
				exclude = true
			}
		}
		if include && !exclude {
			out = append(out, roachpb.Span{
				Key:    roachpb.Key(split.Start),
				EndKey: roachpb.Key(split.End),
			})
		}
	}
	return out
}

// clusterNodeCount returns the approximate number of nodes in the cluster.
func clusterNodeCount(gw gossip.DeprecatedGossip) (int, error) {
	g, err := gw.OptionalErr(47970)
	if err != nil {
		return 0, err
	}
	var nodes int
	_ = g.IterateInfos(
		gossip.KeyNodeIDPrefix, func(_ string, _ gossip.Info) error {
			nodes++
			return nil
		},
	)
	return nodes, nil
}

type spanAndTime struct {
	span       roachpb.Span
	start, end hlc.Timestamp
}

// backup exports a snapshot of every kv entry into ranged sstables.
//
// The output is an sstable per range with files in the following locations:
//  - <dir>/<unique_int>.sst
//  - <dir> is given by the user and may be cloud storage
//  - Each file contains data for a key range that doesn't overlap with any
//    other file.
func backup(
	ctx context.Context,
	db *kv.DB,
	numClusterNodes int,
	settings *cluster.Settings,
	defaultStore cloud.ExternalStorage,
	storageByLocalityKV map[string]*roachpb.ExternalStorage,
	job *jobs.Job,
	backupManifest *BackupManifest,
	checkpointDesc *BackupManifest,
	makeExternalStorage cloud.ExternalStorageFactory,
	encryption *roachpb.FileEncryptionOptions,
) (RowCount, error) {
	// TODO(dan): Figure out how permissions should work. #6713 is tracking this
	// for grpc.

	mu := struct {
		syncutil.Mutex
		files          []BackupManifest_File
		exported       RowCount
		lastCheckpoint time.Time
	}{}

	var checkpointMu syncutil.Mutex

	var ranges []roachpb.RangeDescriptor
	if err := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		var err error
		// TODO(benesch): limit the range descriptors we fetch to the ranges that
		// are actually relevant in the backup to speed up small backups on large
		// clusters.
		ranges, err = allRangeDescriptors(ctx, txn)
		return err
	}); err != nil {
		return RowCount{}, err
	}

	var completedSpans, completedIntroducedSpans []roachpb.Span
	if checkpointDesc != nil {
		// TODO(benesch): verify these files, rather than accepting them as truth
		// blindly.
		// No concurrency yet, so these assignments are safe.
		mu.files = checkpointDesc.Files
		mu.exported = checkpointDesc.EntryCounts
		for _, file := range checkpointDesc.Files {
			if file.StartTime.IsEmpty() && !file.EndTime.IsEmpty() {
				completedIntroducedSpans = append(completedIntroducedSpans, file.Span)
			} else {
				completedSpans = append(completedSpans, file.Span)
			}
		}
	}

	// Subtract out any completed spans and split the remaining spans into
	// range-sized pieces so that we can use the number of completed requests as
	// a rough measure of progress.
	spans := splitAndFilterSpans(backupManifest.Spans, completedSpans, ranges)
	introducedSpans := splitAndFilterSpans(backupManifest.IntroducedSpans, completedIntroducedSpans, ranges)

	allSpans := make([]spanAndTime, 0, len(spans)+len(introducedSpans))
	for _, s := range introducedSpans {
		allSpans = append(allSpans, spanAndTime{span: s, start: hlc.Timestamp{}, end: backupManifest.StartTime})
	}
	for _, s := range spans {
		allSpans = append(allSpans, spanAndTime{span: s, start: backupManifest.StartTime, end: backupManifest.EndTime})
	}

	// Sequential ranges may have clustered leaseholders. For example, a
	// geo-partitioned table likely has all the leaseholders for some contiguous
	// span of the table (i.e. a partition) pinned to just the nodes in a region.
	// In such cases, sending spans sequentially may under-utilize the rest of
	// the cluster given that we have a limit on the number of spans we send out
	// at a given time. Randomizing the order of spans should help ensure a more
	// even distribution of work across the cluster regardless of how
	// leaseholders may or may not be clustered.
	rand.Shuffle(len(allSpans), func(i, j int) {
		allSpans[i], allSpans[j] = allSpans[j], allSpans[i]
	})

	progressLogger := jobs.NewChunkProgressLogger(job, len(spans), job.FractionCompleted(), jobs.ProgressUpdateOnly)

	pkIDs := make(map[uint64]struct{})
	for _, desc := range backupManifest.Descriptors {
		if t := desc.Table(hlc.Timestamp{}); t != nil {
			pkIDs[roachpb.BulkOpSummaryID(uint64(t.ID), uint64(t.PrimaryIndex.ID))] = struct{}{}
		}
	}

	// We're already limiting these on the server-side, but sending all the
	// Export requests at once would fill up distsender/grpc/something and cause
	// all sorts of badness (node liveness timeouts leading to mass leaseholder
	// transfers, poor performance on SQL workloads, etc) as well as log spam
	// about slow distsender requests. Rate limit them here, too.
	//
	// Each node limits the number of running Export & Import requests it serves
	// to avoid overloading the network, so multiply that by the number of nodes
	// in the cluster and use that as the number of outstanding Export requests
	// for the rate limiting. This attempts to strike a balance between
	// simplicity, not getting slow distsender log spam, and keeping the server
	// side limiter full.
	//
	// TODO(dan): Make this limiting per node.
	//
	// TODO(dan): See if there's some better solution than rate-limiting #14798.
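	//
	// For example (hypothetical numbers): with 10 nodes in the cluster and an
	// ExportRequestsLimit of 5, this allows up to 10 * 5 * 10 = 500 Export
	// requests in flight at once across the cluster.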
	maxConcurrentExports := numClusterNodes * int(kvserver.ExportRequestsLimit.Get(&settings.SV)) * 10
	exportsSem := make(chan struct{}, maxConcurrentExports)

	g := ctxgroup.WithContext(ctx)

	requestFinishedCh := make(chan struct{}, len(spans)) // enough buffer to never block

	// Only start the progress logger if there are spans, otherwise this will
	// block forever. This is needed for TestBackupRestoreResume which doesn't
	// have any spans. Users should never hit this.
	if len(spans) > 0 {
		g.GoCtx(func(ctx context.Context) error {
			return progressLogger.Loop(ctx, requestFinishedCh)
		})
	}
	g.GoCtx(func(ctx context.Context) error {
		for i := range allSpans {
			{
				select {
				case exportsSem <- struct{}{}:
				case <-ctx.Done():
					// Break the for loop to avoid creating more work - the backup
					// has failed because either the context has been canceled or an
					// error has been returned. Either way, Wait() is guaranteed to
					// return an error now.
					return ctx.Err()
				}
			}

			span := allSpans[i]
			g.GoCtx(func(ctx context.Context) error {
				defer func() { <-exportsSem }()
				header := roachpb.Header{Timestamp: span.end}
				req := &roachpb.ExportRequest{
					RequestHeader:                       roachpb.RequestHeaderFromSpan(span.span),
					Storage:                             defaultStore.Conf(),
					StorageByLocalityKV:                 storageByLocalityKV,
					StartTime:                           span.start,
					EnableTimeBoundIteratorOptimization: useTBI.Get(&settings.SV),
					MVCCFilter:                          roachpb.MVCCFilter(backupManifest.MVCCFilter),
					Encryption:                          encryption,
				}
				rawRes, pErr := kv.SendWrappedWith(ctx, db.NonTransactionalSender(), header, req)
				if pErr != nil {
					return errors.Wrapf(pErr.GoError(), "exporting %s", span.span)
				}
				res := rawRes.(*roachpb.ExportResponse)

				mu.Lock()
				if backupManifest.RevisionStartTime.Less(res.StartTime) {
					backupManifest.RevisionStartTime = res.StartTime
				}
				for _, file := range res.Files {
					f := BackupManifest_File{
						Span:        file.Span,
						Path:        file.Path,
						Sha512:      file.Sha512,
						EntryCounts: countRows(file.Exported, pkIDs),
						LocalityKV:  file.LocalityKV,
					}
					if span.start != backupManifest.StartTime {
						f.StartTime = span.start
						f.EndTime = span.end
					}
					mu.files = append(mu.files, f)
					mu.exported.add(f.EntryCounts)
				}
				var checkpointFiles BackupFileDescriptors
				if timeutil.Since(mu.lastCheckpoint) > BackupCheckpointInterval {
					// We optimistically assume the checkpoint will succeed to prevent
					// multiple threads from attempting to checkpoint.
					mu.lastCheckpoint = timeutil.Now()
					checkpointFiles = append(checkpointFiles, mu.files...)
				}
				mu.Unlock()

				requestFinishedCh <- struct{}{}

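				// If it is time to checkpoint (checkpointFiles is non-nil), write the
				// accumulated file list out below. checkpointMu, rather than mu,
				// serializes the checkpoint writes themselves, so a slow write from
				// an earlier interval cannot interleave with a later one.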
				if checkpointFiles != nil {
					// Make a copy while holding mu to avoid races while marshaling the
					// manifest into the checkpoint file.
					mu.Lock()
					manifestCopy := *backupManifest
					mu.Unlock()

					checkpointMu.Lock()
					manifestCopy.Files = checkpointFiles
					err := writeBackupManifest(
						ctx, settings, defaultStore, BackupManifestCheckpointName, encryption, &manifestCopy,
					)
					checkpointMu.Unlock()
					if err != nil {
						log.Errorf(ctx, "unable to checkpoint backup descriptor: %+v", err)
					}
				}
				return nil
			})
		}
		return nil
	})

	if err := g.Wait(); err != nil {
		return RowCount{}, errors.Wrapf(err, "exporting %d ranges", errors.Safe(len(spans)))
	}

	// No more concurrency, so no need to acquire locks below.

	backupManifest.Files = mu.files
	backupManifest.EntryCounts = mu.exported

	backupID := uuid.MakeV4()
	backupManifest.ID = backupID
	// Write additional partial descriptors to each node for partitioned backups.
	if len(storageByLocalityKV) > 0 {
		filesByLocalityKV := make(map[string][]BackupManifest_File)
		for i := range mu.files {
			file := &mu.files[i]
			filesByLocalityKV[file.LocalityKV] = append(filesByLocalityKV[file.LocalityKV], *file)
		}

		nextPartitionedDescFilenameID := 1
		for kv, conf := range storageByLocalityKV {
			backupManifest.LocalityKVs = append(backupManifest.LocalityKVs, kv)
			// Set a unique filename for each partition backup descriptor. The ID
			// ensures uniqueness, and the kv string appended to the end is for
			// readability.
			filename := fmt.Sprintf("%s_%d_%s",
				BackupPartitionDescriptorPrefix, nextPartitionedDescFilenameID, sanitizeLocalityKV(kv))
			nextPartitionedDescFilenameID++
			backupManifest.PartitionDescriptorFilenames = append(backupManifest.PartitionDescriptorFilenames, filename)
			desc := BackupPartitionDescriptor{
				LocalityKV: kv,
				Files:      filesByLocalityKV[kv],
				BackupID:   backupID,
			}

			if err := func() error {
				store, err := makeExternalStorage(ctx, *conf)
				if err != nil {
					return err
				}
				defer store.Close()
				return writeBackupPartitionDescriptor(ctx, store, filename, encryption, &desc)
			}(); err != nil {
				return RowCount{}, err
			}
		}
	}

	if err := writeBackupManifest(ctx, settings, defaultStore, BackupManifestName, encryption, backupManifest); err != nil {
		return RowCount{}, err
	}

	return mu.exported, nil
}

func (b *backupResumer) releaseProtectedTimestamp(
	ctx context.Context, txn *kv.Txn, pts protectedts.Storage,
) error {
	details := b.job.Details().(jobspb.BackupDetails)
	ptsID := details.ProtectedTimestampRecord
	// If the job doesn't have a protected timestamp then there's nothing to do.
	if ptsID == nil {
		return nil
	}
	err := pts.Release(ctx, txn, *ptsID)
	if errors.Is(err, protectedts.ErrNotExists) {
		// A missing record is not a reason to fail the job; log and move on.
		log.Warningf(ctx, "failed to release protected timestamp record which does not seem to exist: %v", err)
		err = nil
	}
	return err
}

type backupResumer struct {
	job *jobs.Job

	testingKnobs struct {
		ignoreProtectedTimestamps bool
	}
}

// Resume is part of the jobs.Resumer interface.
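// It verifies the job's protected timestamp record (if any), unmarshals the
// planned BackupManifest from the job details, opens the destination
// ExternalStorage sinks, loads any BACKUP-CHECKPOINT left by a prior attempt,
// runs backup(), and then clears table statistics from the job payload,
// deletes the checkpoint, releases the protected timestamp record, and reports
// the final row counts on resultsCh.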
func (b *backupResumer) Resume(
	ctx context.Context, phs interface{}, resultsCh chan<- tree.Datums,
) error {
	details := b.job.Details().(jobspb.BackupDetails)
	p := phs.(sql.PlanHookState)

	ptsID := details.ProtectedTimestampRecord
	if ptsID != nil && !b.testingKnobs.ignoreProtectedTimestamps {
		if err := p.ExecCfg().ProtectedTimestampProvider.Verify(ctx, *ptsID); err != nil {
			if errors.Is(err, protectedts.ErrNotExists) {
				// A missing record is not a reason to fail the job; log and move on.
				log.Warningf(ctx, "failed to verify protected timestamp record because it does not seem to exist: %v", err)
			} else {
				return err
			}
		}
	}

	if len(details.BackupManifest) == 0 {
		return errors.Newf("missing backup descriptor; cannot resume a backup from an older version")
	}

	var backupManifest BackupManifest
	if err := protoutil.Unmarshal(details.BackupManifest, &backupManifest); err != nil {
		return pgerror.Wrapf(err, pgcode.DataCorrupted,
			"unmarshal backup descriptor")
	}
	// For all backups, partitioned or not, the main BACKUP manifest is stored at
	// details.URI.
	defaultConf, err := cloud.ExternalStorageConfFromURI(details.URI)
	if err != nil {
		return errors.Wrapf(err, "export configuration")
	}
	defaultStore, err := p.ExecCfg().DistSQLSrv.ExternalStorage(ctx, defaultConf)
	if err != nil {
		return errors.Wrapf(err, "make storage")
	}
	storageByLocalityKV := make(map[string]*roachpb.ExternalStorage)
	for kv, uri := range details.URIsByLocalityKV {
		conf, err := cloud.ExternalStorageConfFromURI(uri)
		if err != nil {
			return err
		}
		storageByLocalityKV[kv] = &conf
	}
	var checkpointDesc *BackupManifest

	// We don't read the table descriptors from the backup descriptor, but
	// they could be using either the new or the old foreign key
	// representations. We should just preserve whatever representation the
	// table descriptors were using and leave them alone.
	if desc, err := readBackupManifest(ctx, defaultStore, BackupManifestCheckpointName, details.Encryption); err == nil {
		// If the checkpoint is from a different cluster, it's meaningless to us.
		// More likely though are dummy/lock-out checkpoints with no ClusterID.
		if desc.ClusterID.Equal(p.ExecCfg().ClusterID()) {
			checkpointDesc = &desc
		}
	} else {
		// TODO(benesch): distinguish between a missing checkpoint, which simply
		// indicates the prior backup attempt made no progress, and a corrupted
		// checkpoint, which is more troubling. Sadly, storageccl doesn't provide a
		// "not found" error that's consistent across all ExternalStorage
		// implementations.
		log.Warningf(ctx, "unable to load backup checkpoint while resuming job %d: %v", *b.job.ID(), err)
	}

	numClusterNodes, err := clusterNodeCount(p.ExecCfg().Gossip)
	if err != nil {
		return err
	}

	res, err := backup(
		ctx,
		p.ExecCfg().DB,
		numClusterNodes,
		p.ExecCfg().Settings,
		defaultStore,
		storageByLocalityKV,
		b.job,
		&backupManifest,
		checkpointDesc,
		p.ExecCfg().DistSQLSrv.ExternalStorage,
		details.Encryption,
	)
	if err != nil {
		return err
	}

	err = b.clearStats(ctx, p.ExecCfg().DB)
	if err != nil {
		log.Warningf(ctx, "unable to clear stats from job payload: %+v", err)
	}
	b.deleteCheckpoint(ctx, p.ExecCfg())

	if ptsID != nil && !b.testingKnobs.ignoreProtectedTimestamps {
		if err := p.ExecCfg().DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
			return b.releaseProtectedTimestamp(ctx, txn, p.ExecCfg().ProtectedTimestampProvider)
		}); err != nil {
			log.Errorf(ctx, "failed to release protected timestamp: %v", err)
		}
	}

	resultsCh <- tree.Datums{
		tree.NewDInt(tree.DInt(*b.job.ID())),
		tree.NewDString(string(jobs.StatusSucceeded)),
		tree.NewDFloat(tree.DFloat(1.0)),
		tree.NewDInt(tree.DInt(res.Rows)),
		tree.NewDInt(tree.DInt(res.IndexEntries)),
		tree.NewDInt(tree.DInt(res.DataSize)),
	}

	// Collect telemetry.
	{
		telemetry.Count("backup.total.succeeded")
		const mb = 1 << 20
		sizeMb := res.DataSize / mb
		sec := int64(timeutil.Since(timeutil.FromUnixMicros(b.job.Payload().StartedMicros)).Seconds())
		var mbps int64
		if sec > 0 {
			mbps = sizeMb / sec
		}
		if details.StartTime.IsEmpty() {
			telemetry.CountBucketed("backup.duration-sec.full-succeeded", sec)
			telemetry.CountBucketed("backup.size-mb.full", sizeMb)
			telemetry.CountBucketed("backup.speed-mbps.full.total", mbps)
			telemetry.CountBucketed("backup.speed-mbps.full.per-node", mbps/int64(numClusterNodes))
		} else {
			telemetry.CountBucketed("backup.duration-sec.inc-succeeded", sec)
			telemetry.CountBucketed("backup.size-mb.inc", sizeMb)
			telemetry.CountBucketed("backup.speed-mbps.inc.total", mbps)
			telemetry.CountBucketed("backup.speed-mbps.inc.per-node", mbps/int64(numClusterNodes))
		}
	}

	return nil
}

func (b *backupResumer) clearStats(ctx context.Context, db *kv.DB) error {
	details := b.job.Details().(jobspb.BackupDetails)
	var backupManifest BackupManifest
	if err := protoutil.Unmarshal(details.BackupManifest, &backupManifest); err != nil {
		return err
	}
	backupManifest.Statistics = nil
	descBytes, err := protoutil.Marshal(&backupManifest)
	if err != nil {
		return err
	}
	details.BackupManifest = descBytes
	err = db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		return b.job.WithTxn(txn).SetDetails(ctx, details)
	})
	return err
}

// OnFailOrCancel is part of the jobs.Resumer interface.
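// It records failure telemetry, deletes any BACKUP-CHECKPOINT file at the
// backup destination, and releases the job's protected timestamp record, if
// one exists.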
func (b *backupResumer) OnFailOrCancel(ctx context.Context, phs interface{}) error {
	telemetry.Count("backup.total.failed")
	telemetry.CountBucketed("backup.duration-sec.failed",
		int64(timeutil.Since(timeutil.FromUnixMicros(b.job.Payload().StartedMicros)).Seconds()))

	cfg := phs.(sql.PlanHookState).ExecCfg()
	b.deleteCheckpoint(ctx, cfg)
	return cfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		return b.releaseProtectedTimestamp(ctx, txn, cfg.ProtectedTimestampProvider)
	})
}

func (b *backupResumer) deleteCheckpoint(ctx context.Context, cfg *sql.ExecutorConfig) {
	// Attempt to delete BACKUP-CHECKPOINT.
	if err := func() error {
		details := b.job.Details().(jobspb.BackupDetails)
		// For all backups, partitioned or not, the main BACKUP manifest is stored
		// at details.URI.
		conf, err := cloud.ExternalStorageConfFromURI(details.URI)
		if err != nil {
			return err
		}
		exportStore, err := cfg.DistSQLSrv.ExternalStorage(ctx, conf)
		if err != nil {
			return err
		}
		return exportStore.Delete(ctx, BackupManifestCheckpointName)
	}(); err != nil {
		log.Warningf(ctx, "unable to delete checkpointed backup descriptor: %+v", err)
	}
}

var _ jobs.Resumer = &backupResumer{}

func init() {
	jobs.RegisterConstructor(
		jobspb.TypeBackup,
		func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
			return &backupResumer{
				job: job,
			}
		},
	)
}