github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/backupccl/backup_job.go

// Copyright 2016 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package backupccl

import (
	"context"
	"fmt"
	"math/rand"
	"time"

	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/covering"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
)

// BackupCheckpointInterval is the interval at which backup progress is saved
// to durable storage.
var BackupCheckpointInterval = time.Minute
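// add accumulates the data size, row count, and index entry count from other
// into r.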
func (r *RowCount) add(other RowCount) {
	r.DataSize += other.DataSize
	r.Rows += other.Rows
	r.IndexEntries += other.IndexEntries
}
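// countRows converts a raw BulkOpSummary into a RowCount, counting entries in
// a primary index (as identified by pkIDs) as rows and all other entries as
// index entries.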
func countRows(raw roachpb.BulkOpSummary, pkIDs map[uint64]struct{}) RowCount {
	res := RowCount{DataSize: raw.DataSize}
	for id, count := range raw.EntryCounts {
		if _, ok := pkIDs[id]; ok {
			res.Rows += count
		} else {
			res.IndexEntries += count
		}
	}
	return res
}
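// allRangeDescriptors scans the meta2 range addressing records and returns the
// descriptor of every range in the cluster.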
func allRangeDescriptors(ctx context.Context, txn *kv.Txn) ([]roachpb.RangeDescriptor, error) {
	rows, err := txn.Scan(ctx, keys.Meta2Prefix, keys.MetaMax, 0)
	if err != nil {
		return nil, errors.Wrapf(err,
			"unable to scan range descriptors")
	}

	rangeDescs := make([]roachpb.RangeDescriptor, len(rows))
	for i, row := range rows {
		if err := row.ValueProto(&rangeDescs[i]); err != nil {
			return nil, errors.NewAssertionErrorWithWrappedErrf(err,
				"%s: unable to unmarshal range descriptor", row.Key)
		}
	}
	return rangeDescs, nil
}
// coveringFromSpans creates an interval.Covering with a fixed payload from a
// slice of roachpb.Spans.
func coveringFromSpans(spans []roachpb.Span, payload interface{}) covering.Covering {
	var c covering.Covering
	for _, span := range spans {
		c = append(c, covering.Range{
			Start:   []byte(span.Key),
			End:     []byte(span.EndKey),
			Payload: payload,
		})
	}
	return c
}

// splitAndFilterSpans returns the spans that represent the set difference
// (includes - excludes) while also guaranteeing that each output span does not
// cross the endpoint of a RangeDescriptor in ranges.
func splitAndFilterSpans(
	includes []roachpb.Span, excludes []roachpb.Span, ranges []roachpb.RangeDescriptor,
) []roachpb.Span {
	type includeMarker struct{}
	type excludeMarker struct{}

	includeCovering := coveringFromSpans(includes, includeMarker{})
	excludeCovering := coveringFromSpans(excludes, excludeMarker{})

	var rangeCovering covering.Covering
	for _, rangeDesc := range ranges {
		rangeCovering = append(rangeCovering, covering.Range{
			Start: []byte(rangeDesc.StartKey),
			End:   []byte(rangeDesc.EndKey),
		})
	}

	splits := covering.OverlapCoveringMerge(
		[]covering.Covering{includeCovering, excludeCovering, rangeCovering},
	)

	var out []roachpb.Span
	for _, split := range splits {
		include := false
		exclude := false
		for _, payload := range split.Payload.([]interface{}) {
			switch payload.(type) {
			case includeMarker:
				include = true
			case excludeMarker:
				exclude = true
			}
		}
		if include && !exclude {
			out = append(out, roachpb.Span{
				Key:    roachpb.Key(split.Start),
				EndKey: roachpb.Key(split.End),
			})
		}
	}
	return out
}

// clusterNodeCount returns the approximate number of nodes in the cluster.
func clusterNodeCount(gw gossip.DeprecatedGossip) (int, error) {
	g, err := gw.OptionalErr(47970)
	if err != nil {
		return 0, err
	}
	var nodes int
	_ = g.IterateInfos(
		gossip.KeyNodeIDPrefix, func(_ string, _ gossip.Info) error {
			nodes++
			return nil
		},
	)
	return nodes, nil
}
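// spanAndTime pairs a key span with the start and end timestamps over which it
// should be exported.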
type spanAndTime struct {
	span       roachpb.Span
	start, end hlc.Timestamp
}
// backup exports a snapshot of every kv entry into ranged sstables.
//
// The output is an sstable per range with files in the following locations:
// - <dir>/<unique_int>.sst
// - <dir> is given by the user and may be cloud storage
// - Each file contains data for a key range that doesn't overlap with any other
//   file.
func backup(
	ctx context.Context,
	db *kv.DB,
	numClusterNodes int,
	settings *cluster.Settings,
	defaultStore cloud.ExternalStorage,
	storageByLocalityKV map[string]*roachpb.ExternalStorage,
	job *jobs.Job,
	backupManifest *BackupManifest,
	checkpointDesc *BackupManifest,
	makeExternalStorage cloud.ExternalStorageFactory,
	encryption *roachpb.FileEncryptionOptions,
) (RowCount, error) {
	// TODO(dan): Figure out how permissions should work. #6713 is tracking this
	// for grpc.
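	// mu guards the backup state that the export workers below mutate
	// concurrently: the accumulated file list, the running entry counts, and
	// the time of the last checkpoint.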
	mu := struct {
		syncutil.Mutex
		files          []BackupManifest_File
		exported       RowCount
		lastCheckpoint time.Time
	}{}

	var checkpointMu syncutil.Mutex

	var ranges []roachpb.RangeDescriptor
	if err := db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		var err error
		// TODO(benesch): limit the range descriptors we fetch to the ranges that
		// are actually relevant in the backup to speed up small backups on large
		// clusters.
		ranges, err = allRangeDescriptors(ctx, txn)
		return err
	}); err != nil {
		return RowCount{}, err
	}

	var completedSpans, completedIntroducedSpans []roachpb.Span
	if checkpointDesc != nil {
		// TODO(benesch): verify these files, rather than accepting them as truth
		// blindly.
		// No concurrency yet, so these assignments are safe.
		mu.files = checkpointDesc.Files
		mu.exported = checkpointDesc.EntryCounts
		for _, file := range checkpointDesc.Files {
			if file.StartTime.IsEmpty() && !file.EndTime.IsEmpty() {
				completedIntroducedSpans = append(completedIntroducedSpans, file.Span)
			} else {
				completedSpans = append(completedSpans, file.Span)
			}
		}
	}

	// Subtract out any completed spans and split the remaining spans into
	// range-sized pieces so that we can use the number of completed requests as a
	// rough measure of progress.
	spans := splitAndFilterSpans(backupManifest.Spans, completedSpans, ranges)
	introducedSpans := splitAndFilterSpans(backupManifest.IntroducedSpans, completedIntroducedSpans, ranges)

	allSpans := make([]spanAndTime, 0, len(spans)+len(introducedSpans))
	for _, s := range introducedSpans {
		allSpans = append(allSpans, spanAndTime{span: s, start: hlc.Timestamp{}, end: backupManifest.StartTime})
	}
	for _, s := range spans {
		allSpans = append(allSpans, spanAndTime{span: s, start: backupManifest.StartTime, end: backupManifest.EndTime})
	}

	// Sequential ranges may have clustered leaseholders, for example a
	// geo-partitioned table likely has all the leaseholders for some contiguous
	// span of the table (i.e. a partition) pinned to just the nodes in a region.
	// In such cases, sending spans sequentially may under-utilize the rest of the
	// cluster given that we have a limit on the number of spans we send out at
	// a given time. Randomizing the order of spans should help ensure a more even
	// distribution of work across the cluster regardless of how leaseholders may
	// or may not be clustered.
	rand.Shuffle(len(allSpans), func(i, j int) {
		allSpans[i], allSpans[j] = allSpans[j], allSpans[i]
	})
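	// progressLogger persists the job's fraction completed as export requests
	// finish, signalled via requestFinishedCh below.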
	progressLogger := jobs.NewChunkProgressLogger(job, len(spans), job.FractionCompleted(), jobs.ProgressUpdateOnly)

	pkIDs := make(map[uint64]struct{})
	for _, desc := range backupManifest.Descriptors {
		if t := desc.Table(hlc.Timestamp{}); t != nil {
			pkIDs[roachpb.BulkOpSummaryID(uint64(t.ID), uint64(t.PrimaryIndex.ID))] = struct{}{}
		}
	}

	// We're already limiting these on the server-side, but sending all the
	// Export requests at once would fill up distsender/grpc/something and cause
	// all sorts of badness (node liveness timeouts leading to mass leaseholder
	// transfers, poor performance on SQL workloads, etc) as well as log spam
	// about slow distsender requests. Rate limit them here, too.
	//
	// Each node limits the number of running Export & Import requests it serves
	// to avoid overloading the network, so multiply that by the number of nodes
	// in the cluster and use that as the number of outstanding Export requests
	// for the rate limiting. This attempts to strike a balance between
	// simplicity, not getting slow distsender log spam, and keeping the server
	// side limiter full.
	//
	// TODO(dan): Make this limiting per node.
	//
	// TODO(dan): See if there's some better solution than rate-limiting #14798.
	maxConcurrentExports := numClusterNodes * int(kvserver.ExportRequestsLimit.Get(&settings.SV)) * 10
	exportsSem := make(chan struct{}, maxConcurrentExports)

	g := ctxgroup.WithContext(ctx)
	// Buffer one signal per exported span (including introduced spans) so that
	// the workers never block on this channel.
	requestFinishedCh := make(chan struct{}, len(allSpans))

	// Only start the progress logger if there are spans, otherwise this will
	// block forever. This is needed for TestBackupRestoreResume which doesn't
	// have any spans. Users should never hit this.
	if len(spans) > 0 {
		g.GoCtx(func(ctx context.Context) error {
			return progressLogger.Loop(ctx, requestFinishedCh)
		})
	}
	g.GoCtx(func(ctx context.Context) error {
		for i := range allSpans {
			{
				select {
				case exportsSem <- struct{}{}:
				case <-ctx.Done():
					// Break the for loop to avoid creating more work - the backup
					// has failed because either the context has been canceled or an
					// error has been returned. Either way, Wait() is guaranteed to
					// return an error now.
					return ctx.Err()
				}
			}

			span := allSpans[i]
			g.GoCtx(func(ctx context.Context) error {
				defer func() { <-exportsSem }()
				header := roachpb.Header{Timestamp: span.end}
				req := &roachpb.ExportRequest{
					RequestHeader:                       roachpb.RequestHeaderFromSpan(span.span),
					Storage:                             defaultStore.Conf(),
					StorageByLocalityKV:                 storageByLocalityKV,
					StartTime:                           span.start,
					EnableTimeBoundIteratorOptimization: useTBI.Get(&settings.SV),
					MVCCFilter:                          roachpb.MVCCFilter(backupManifest.MVCCFilter),
					Encryption:                          encryption,
				}
				rawRes, pErr := kv.SendWrappedWith(ctx, db.NonTransactionalSender(), header, req)
				if pErr != nil {
					return errors.Wrapf(pErr.GoError(), "exporting %s", span.span)
				}
				res := rawRes.(*roachpb.ExportResponse)

				mu.Lock()
				if backupManifest.RevisionStartTime.Less(res.StartTime) {
					backupManifest.RevisionStartTime = res.StartTime
				}
				for _, file := range res.Files {
					f := BackupManifest_File{
						Span:        file.Span,
						Path:        file.Path,
						Sha512:      file.Sha512,
						EntryCounts: countRows(file.Exported, pkIDs),
						LocalityKV:  file.LocalityKV,
					}
					if span.start != backupManifest.StartTime {
						f.StartTime = span.start
						f.EndTime = span.end
					}
					mu.files = append(mu.files, f)
					mu.exported.add(f.EntryCounts)
				}
				var checkpointFiles BackupFileDescriptors
				if timeutil.Since(mu.lastCheckpoint) > BackupCheckpointInterval {
					// We optimistically assume the checkpoint will succeed to prevent
					// multiple threads from attempting to checkpoint.
					mu.lastCheckpoint = timeutil.Now()
					checkpointFiles = append(checkpointFiles, mu.files...)
				}
				mu.Unlock()

				requestFinishedCh <- struct{}{}
				if checkpointFiles != nil {
					// Make a copy while holding mu to avoid races while marshaling the
					// manifest into the checkpoint file.
					mu.Lock()
					manifestCopy := *backupManifest
					mu.Unlock()

					checkpointMu.Lock()
					manifestCopy.Files = checkpointFiles
					err := writeBackupManifest(
						ctx, settings, defaultStore, BackupManifestCheckpointName, encryption, &manifestCopy,
					)
					checkpointMu.Unlock()
					if err != nil {
						log.Errorf(ctx, "unable to checkpoint backup descriptor: %+v", err)
					}
				}
				return nil
			})
		}
		return nil
	})

	if err := g.Wait(); err != nil {
		return RowCount{}, errors.Wrapf(err, "exporting %d ranges", errors.Safe(len(spans)))
	}

	// No more concurrency, so no need to acquire locks below.

	backupManifest.Files = mu.files
	backupManifest.EntryCounts = mu.exported

	backupID := uuid.MakeV4()
	backupManifest.ID = backupID
	// Write additional partial descriptors to each node for partitioned backups.
	if len(storageByLocalityKV) > 0 {
		filesByLocalityKV := make(map[string][]BackupManifest_File)
		for i := range mu.files {
			file := &mu.files[i]
			filesByLocalityKV[file.LocalityKV] = append(filesByLocalityKV[file.LocalityKV], *file)
		}

		nextPartitionedDescFilenameID := 1
		for kv, conf := range storageByLocalityKV {
			backupManifest.LocalityKVs = append(backupManifest.LocalityKVs, kv)
			// Set a unique filename for each partition backup descriptor. The ID
			// ensures uniqueness, and the kv string appended to the end is for
			// readability.
			filename := fmt.Sprintf("%s_%d_%s",
				BackupPartitionDescriptorPrefix, nextPartitionedDescFilenameID, sanitizeLocalityKV(kv))
			nextPartitionedDescFilenameID++
			backupManifest.PartitionDescriptorFilenames = append(backupManifest.PartitionDescriptorFilenames, filename)
			desc := BackupPartitionDescriptor{
				LocalityKV: kv,
				Files:      filesByLocalityKV[kv],
				BackupID:   backupID,
			}

			if err := func() error {
				store, err := makeExternalStorage(ctx, *conf)
				if err != nil {
					return err
				}
				defer store.Close()
				return writeBackupPartitionDescriptor(ctx, store, filename, encryption, &desc)
			}(); err != nil {
				return RowCount{}, err
			}
		}
	}

	if err := writeBackupManifest(ctx, settings, defaultStore, BackupManifestName, encryption, backupManifest); err != nil {
		return RowCount{}, err
	}

	return mu.exported, nil
}
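// releaseProtectedTimestamp releases the protected timestamp record associated
// with this backup job, if there is one. A record that no longer exists is not
// treated as an error.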
func (b *backupResumer) releaseProtectedTimestamp(
	ctx context.Context, txn *kv.Txn, pts protectedts.Storage,
) error {
	details := b.job.Details().(jobspb.BackupDetails)
	ptsID := details.ProtectedTimestampRecord
	// If the job doesn't have a protected timestamp then there's nothing to do.
	if ptsID == nil {
		return nil
	}
	err := pts.Release(ctx, txn, *ptsID)
	if errors.Is(err, protectedts.ErrNotExists) {
		// The record does not exist, so there is nothing to release; don't treat
		// this as an error.
		log.Warningf(ctx, "failed to release protected timestamp record that does not exist: %v", err)
		err = nil
	}
	return err
}
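// backupResumer resumes and runs BACKUP jobs; it implements the jobs.Resumer
// interface.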
type backupResumer struct {
	job *jobs.Job

	testingKnobs struct {
		ignoreProtectedTimestamps bool
	}
}
// Resume is part of the jobs.Resumer interface.
func (b *backupResumer) Resume(
	ctx context.Context, phs interface{}, resultsCh chan<- tree.Datums,
) error {
	details := b.job.Details().(jobspb.BackupDetails)
	p := phs.(sql.PlanHookState)

	ptsID := details.ProtectedTimestampRecord
	if ptsID != nil && !b.testingKnobs.ignoreProtectedTimestamps {
		if err := p.ExecCfg().ProtectedTimestampProvider.Verify(ctx, *ptsID); err != nil {
			if errors.Is(err, protectedts.ErrNotExists) {
				// The record does not exist, so there is nothing to verify; don't
				// treat this as an error.
				log.Warningf(ctx, "failed to verify protected timestamp record that does not exist: %v", err)
			} else {
				return err
			}
		}
	}

	if len(details.BackupManifest) == 0 {
		return errors.Newf("missing backup descriptor; cannot resume a backup from an older version")
	}

	var backupManifest BackupManifest
	if err := protoutil.Unmarshal(details.BackupManifest, &backupManifest); err != nil {
		return pgerror.Wrapf(err, pgcode.DataCorrupted,
			"unmarshal backup descriptor")
	}
	// For all backups, partitioned or not, the main BACKUP manifest is stored at
	// details.URI.
	defaultConf, err := cloud.ExternalStorageConfFromURI(details.URI)
	if err != nil {
		return errors.Wrapf(err, "export configuration")
	}
	defaultStore, err := p.ExecCfg().DistSQLSrv.ExternalStorage(ctx, defaultConf)
	if err != nil {
		return errors.Wrapf(err, "make storage")
	}
	storageByLocalityKV := make(map[string]*roachpb.ExternalStorage)
	for kv, uri := range details.URIsByLocalityKV {
		conf, err := cloud.ExternalStorageConfFromURI(uri)
		if err != nil {
			return err
		}
		storageByLocalityKV[kv] = &conf
	}
	var checkpointDesc *BackupManifest

	// We don't read the table descriptors from the backup descriptor, but
	// they could be using either the new or the old foreign key
	// representations. We should just preserve whatever representation the
	// table descriptors were using and leave them alone.
	if desc, err := readBackupManifest(ctx, defaultStore, BackupManifestCheckpointName, details.Encryption); err == nil {
		// If the checkpoint is from a different cluster, it's meaningless to us.
		// More likely though are dummy/lock-out checkpoints with no ClusterID.
		if desc.ClusterID.Equal(p.ExecCfg().ClusterID()) {
			checkpointDesc = &desc
		}
	} else {
		// TODO(benesch): distinguish between a missing checkpoint, which simply
		// indicates the prior backup attempt made no progress, and a corrupted
		// checkpoint, which is more troubling. Sadly, storageccl doesn't provide a
		// "not found" error that's consistent across all ExternalStorage
		// implementations.
		log.Warningf(ctx, "unable to load backup checkpoint while resuming job %d: %v", *b.job.ID(), err)
	}

	numClusterNodes, err := clusterNodeCount(p.ExecCfg().Gossip)
	if err != nil {
		return err
	}

	res, err := backup(
		ctx,
		p.ExecCfg().DB,
		numClusterNodes,
		p.ExecCfg().Settings,
		defaultStore,
		storageByLocalityKV,
		b.job,
		&backupManifest,
		checkpointDesc,
		p.ExecCfg().DistSQLSrv.ExternalStorage,
		details.Encryption,
	)
	if err != nil {
		return err
	}

	err = b.clearStats(ctx, p.ExecCfg().DB)
	if err != nil {
		log.Warningf(ctx, "unable to clear stats from job payload: %+v", err)
	}
	b.deleteCheckpoint(ctx, p.ExecCfg())

	if ptsID != nil && !b.testingKnobs.ignoreProtectedTimestamps {
		if err := p.ExecCfg().DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
			return b.releaseProtectedTimestamp(ctx, txn, p.ExecCfg().ProtectedTimestampProvider)
		}); err != nil {
			log.Errorf(ctx, "failed to release protected timestamp: %v", err)
		}
	}

	resultsCh <- tree.Datums{
		tree.NewDInt(tree.DInt(*b.job.ID())),
		tree.NewDString(string(jobs.StatusSucceeded)),
		tree.NewDFloat(tree.DFloat(1.0)),
		tree.NewDInt(tree.DInt(res.Rows)),
		tree.NewDInt(tree.DInt(res.IndexEntries)),
		tree.NewDInt(tree.DInt(res.DataSize)),
	}

	// Collect telemetry.
	{
		telemetry.Count("backup.total.succeeded")
		const mb = 1 << 20
		sizeMb := res.DataSize / mb
		sec := int64(timeutil.Since(timeutil.FromUnixMicros(b.job.Payload().StartedMicros)).Seconds())
		var mbps int64
		if sec > 0 {
			// Overall throughput in MB per second.
			mbps = sizeMb / sec
		}
		if details.StartTime.IsEmpty() {
			telemetry.CountBucketed("backup.duration-sec.full-succeeded", sec)
			telemetry.CountBucketed("backup.size-mb.full", sizeMb)
			telemetry.CountBucketed("backup.speed-mbps.full.total", mbps)
			telemetry.CountBucketed("backup.speed-mbps.full.per-node", mbps/int64(numClusterNodes))
		} else {
			telemetry.CountBucketed("backup.duration-sec.inc-succeeded", sec)
			telemetry.CountBucketed("backup.size-mb.inc", sizeMb)
			telemetry.CountBucketed("backup.speed-mbps.inc.total", mbps)
			telemetry.CountBucketed("backup.speed-mbps.inc.per-node", mbps/int64(numClusterNodes))
		}
	}

	return nil
}
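// clearStats removes the table statistics from the backup manifest stored in
// the job details and persists the updated details back to the job record.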
func (b *backupResumer) clearStats(ctx context.Context, db *kv.DB) error {
	details := b.job.Details().(jobspb.BackupDetails)
	var backupManifest BackupManifest
	if err := protoutil.Unmarshal(details.BackupManifest, &backupManifest); err != nil {
		return err
	}
	backupManifest.Statistics = nil
	descBytes, err := protoutil.Marshal(&backupManifest)
	if err != nil {
		return err
	}
	details.BackupManifest = descBytes
	err = db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		return b.job.WithTxn(txn).SetDetails(ctx, details)
	})
	return err
}
// OnFailOrCancel is part of the jobs.Resumer interface.
func (b *backupResumer) OnFailOrCancel(ctx context.Context, phs interface{}) error {
	telemetry.Count("backup.total.failed")
	telemetry.CountBucketed("backup.duration-sec.failed",
		int64(timeutil.Since(timeutil.FromUnixMicros(b.job.Payload().StartedMicros)).Seconds()))

	cfg := phs.(sql.PlanHookState).ExecCfg()
	b.deleteCheckpoint(ctx, cfg)
	return cfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		return b.releaseProtectedTimestamp(ctx, txn, cfg.ProtectedTimestampProvider)
	})
}
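// deleteCheckpoint attempts to remove the BACKUP-CHECKPOINT file for this job
// from external storage, logging (but not returning) any error it encounters.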
func (b *backupResumer) deleteCheckpoint(ctx context.Context, cfg *sql.ExecutorConfig) {
	// Attempt to delete BACKUP-CHECKPOINT.
	if err := func() error {
		details := b.job.Details().(jobspb.BackupDetails)
		// For all backups, partitioned or not, the main BACKUP manifest is stored at
		// details.URI.
		conf, err := cloud.ExternalStorageConfFromURI(details.URI)
		if err != nil {
			return err
		}
		exportStore, err := cfg.DistSQLSrv.ExternalStorage(ctx, conf)
		if err != nil {
			return err
		}
		return exportStore.Delete(ctx, BackupManifestCheckpointName)
	}(); err != nil {
		log.Warningf(ctx, "unable to delete checkpointed backup descriptor: %+v", err)
	}
}

var _ jobs.Resumer = &backupResumer{}
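// Register backupResumer as the resumer for backup jobs so the jobs framework
// can construct it when running or resuming a BACKUP.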
func init() {
	jobs.RegisterConstructor(
		jobspb.TypeBackup,
		func(job *jobs.Job, _ *cluster.Settings) jobs.Resumer {
			return &backupResumer{
				job: job,
			}
		},
	)
}