github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/backupccl/restore_job.go

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Licensed as a CockroachDB Enterprise file under the Cockroach Community
     4  // License (the "License"); you may not use this file except in compliance with
     5  // the License. You may obtain a copy of the License at
     6  //
     7  //     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt
     8  
     9  package backupccl
    10  
    11  import (
    12  	"bytes"
    13  	"context"
    14  	"fmt"
    15  	"math"
    16  	"runtime"
    17  	"sync/atomic"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/ccl/storageccl"
    21  	"github.com/cockroachdb/cockroach/pkg/jobs"
    22  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    23  	"github.com/cockroachdb/cockroach/pkg/keys"
    24  	"github.com/cockroachdb/cockroach/pkg/kv"
    25  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    26  	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
    27  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    28  	"github.com/cockroachdb/cockroach/pkg/sql"
    29  	"github.com/cockroachdb/cockroach/pkg/sql/catalog/catalogkv"
    30  	"github.com/cockroachdb/cockroach/pkg/sql/covering"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
    32  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
    33  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    34  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    35  	"github.com/cockroachdb/cockroach/pkg/sql/stats"
    36  	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
    37  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    38  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    39  	"github.com/cockroachdb/cockroach/pkg/util/interval"
    40  	"github.com/cockroachdb/cockroach/pkg/util/log"
    41  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    42  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    43  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    44  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    45  	"github.com/cockroachdb/errors"
    46  	"github.com/opentracing/opentracing-go"
    47  )
    48  
    49  type intervalSpan roachpb.Span
    50  
    51  var _ interval.Interface = intervalSpan{}
    52  
    53  // ID is part of `interval.Interface` but unused in makeImportSpans.
    54  func (ie intervalSpan) ID() uintptr { return 0 }
    55  
    56  // Range is part of `interval.Interface`.
    57  func (ie intervalSpan) Range() interval.Range {
    58  	return interval.Range{Start: []byte(ie.Key), End: []byte(ie.EndKey)}
    59  }
    60  
    61  type importEntryType int
    62  
    63  const (
    64  	backupSpan importEntryType = iota
    65  	backupFile
    66  	tableSpan
    67  	completedSpan
    68  	request
    69  )
    70  
    71  type importEntry struct {
    72  	roachpb.Span
    73  	entryType importEntryType
    74  
    75  	// Only set if entryType is backupSpan
    76  	start, end hlc.Timestamp
    77  
    78  	// Only set if entryType is backupFile
    79  	dir  roachpb.ExternalStorage
    80  	file BackupManifest_File
    81  
    82  	// Only set if entryType is request
    83  	files []roachpb.ImportRequest_File
    84  
    85  	// progressIdx is used for progress tracking; spans are numbered up front
    86  	// because they can be executed out-of-order based on splitAndScatter's scheduling.
    87  	progressIdx int
    88  }
    89  
    90  // makeImportSpans pivots the backups, which are grouped by time, into
    91  // spans for import, which are grouped by keyrange.
    92  //
    93  // The core logic of this is in OverlapCoveringMerge, which accepts sets of
    94  // non-overlapping key ranges (aka coverings), each with a payload, and returns
    95  // ranges with aligned boundaries, each carrying its overlapping payloads in input order.
    96  //
    97  // Example (input):
    98  // - [A, C) backup t0 to t1 -> /file1
    99  // - [C, D) backup t0 to t1 -> /file2
   100  // - [A, B) backup t1 to t2 -> /file3
   101  // - [B, C) backup t1 to t2 -> /file4
   102  // - [C, D) backup t1 to t2 -> /file5
   103  // - [B, D) requested table data to be restored
   104  //
   105  // Example (output):
   106  // - [A, B) -> /file1, /file3
   107  // - [B, C) -> /file1, /file4, requested (note that file1 was split into two ranges)
   108  // - [C, D) -> /file2, /file5, requested
   109  //
   110  // This would be turned into two Import spans, one restoring [B, C) out of
   111  // /file1 and /file4, the other restoring [C, D) out of /file2 and /file5.
   112  // Nothing is restored out of /file3 and only part of /file1 is used.
   113  //
   114  // NB: All grouping operates in the pre-rewrite keyspace, meaning the keyranges
   115  // as they were backed up, not as they're being restored.
   116  //
   117  // If a span is not covered, the onMissing function is called with the span and
   118  // time missing to determine what error, if any, should be returned.
   119  func makeImportSpans(
   120  	tableSpans []roachpb.Span,
   121  	backups []BackupManifest,
   122  	backupLocalityInfo []jobspb.RestoreDetails_BackupLocalityInfo,
   123  	lowWaterMark roachpb.Key,
   124  	onMissing func(span covering.Range, start, end hlc.Timestamp) error,
   125  ) ([]importEntry, hlc.Timestamp, error) {
   126  	// Put the covering for the already-completed spans into the
   127  	// OverlapCoveringMerge input first. Payloads are returned in the same order
   128  	// that they appear in the input; putting the completedSpan first means we'll
   129  	// see it first when iterating over the output of OverlapCoveringMerge and
   130  	// avoid doing unnecessary work.
   131  	completedCovering := covering.Covering{
   132  		{
   133  			Start:   []byte(keys.MinKey),
   134  			End:     []byte(lowWaterMark),
   135  			Payload: importEntry{entryType: completedSpan},
   136  		},
   137  	}
   138  
   139  	// Put the merged table data covering into the OverlapCoveringMerge input
   140  	// next.
   141  	var tableSpanCovering covering.Covering
   142  	for _, span := range tableSpans {
   143  		tableSpanCovering = append(tableSpanCovering, covering.Range{
   144  			Start: span.Key,
   145  			End:   span.EndKey,
   146  			Payload: importEntry{
   147  				Span:      span,
   148  				entryType: tableSpan,
   149  			},
   150  		})
   151  	}
   152  
   153  	backupCoverings := []covering.Covering{completedCovering, tableSpanCovering}
   154  
   155  	// Iterate over backups, creating coverings for each: the spans newly
   156  	// introduced by the backup (if any), the spans that were backed up, and the
   157  	// files in the backup. The file covering is a subset of the span covering
   158  	// when some of the keyranges didn't change since the previous backup. The
   159  	// coverings are appended in this order so each output range sees a backup's
   160  	// span entries before its file entries in the output of OverlapCoveringMerge.
   161  	var maxEndTime hlc.Timestamp
   162  	for i, b := range backups {
   163  		if maxEndTime.Less(b.EndTime) {
   164  			maxEndTime = b.EndTime
   165  		}
   166  
   167  		var backupNewSpanCovering covering.Covering
   168  		for _, s := range b.IntroducedSpans {
   169  			backupNewSpanCovering = append(backupNewSpanCovering, covering.Range{
   170  				Start:   s.Key,
   171  				End:     s.EndKey,
   172  				Payload: importEntry{Span: s, entryType: backupSpan, start: hlc.Timestamp{}, end: b.StartTime},
   173  			})
   174  		}
   175  		backupCoverings = append(backupCoverings, backupNewSpanCovering)
   176  
   177  		var backupSpanCovering covering.Covering
   178  		for _, s := range b.Spans {
   179  			backupSpanCovering = append(backupSpanCovering, covering.Range{
   180  				Start:   s.Key,
   181  				End:     s.EndKey,
   182  				Payload: importEntry{Span: s, entryType: backupSpan, start: b.StartTime, end: b.EndTime},
   183  			})
   184  		}
   185  		backupCoverings = append(backupCoverings, backupSpanCovering)
   186  		var backupFileCovering covering.Covering
   187  
   188  		var storesByLocalityKV map[string]roachpb.ExternalStorage
   189  		if backupLocalityInfo != nil && backupLocalityInfo[i].URIsByOriginalLocalityKV != nil {
   190  			storesByLocalityKV = make(map[string]roachpb.ExternalStorage)
   191  			for kv, uri := range backupLocalityInfo[i].URIsByOriginalLocalityKV {
   192  				conf, err := cloud.ExternalStorageConfFromURI(uri)
   193  				if err != nil {
   194  					return nil, hlc.Timestamp{}, err
   195  				}
   196  				storesByLocalityKV[kv] = conf
   197  			}
   198  		}
   199  		for _, f := range b.Files {
   200  			dir := b.Dir
   201  			if storesByLocalityKV != nil {
   202  				if newDir, ok := storesByLocalityKV[f.LocalityKV]; ok {
   203  					dir = newDir
   204  				}
   205  			}
   206  			backupFileCovering = append(backupFileCovering, covering.Range{
   207  				Start: f.Span.Key,
   208  				End:   f.Span.EndKey,
   209  				Payload: importEntry{
   210  					Span:      f.Span,
   211  					entryType: backupFile,
   212  					dir:       dir,
   213  					file:      f,
   214  				},
   215  			})
   216  		}
   217  		backupCoverings = append(backupCoverings, backupFileCovering)
   218  	}
   219  
   220  	// Group ranges covered by backups with ones needed to restore the selected
   221  	// tables. Note that this breaks intervals up as necessary to align them.
   222  	// See the function godoc for details.
   223  	importRanges := covering.OverlapCoveringMerge(backupCoverings)
   224  
   225  	// Translate the output of OverlapCoveringMerge into requests.
   226  	var requestEntries []importEntry
   227  rangeLoop:
   228  	for _, importRange := range importRanges {
   229  		needed := false
   230  		var ts hlc.Timestamp
   231  		var files []roachpb.ImportRequest_File
   232  		payloads := importRange.Payload.([]interface{})
   233  		for _, p := range payloads {
   234  			ie := p.(importEntry)
   235  			switch ie.entryType {
   236  			case completedSpan:
   237  				continue rangeLoop
   238  			case tableSpan:
   239  				needed = true
   240  			case backupSpan:
   241  				if ts != ie.start {
   242  					return nil, hlc.Timestamp{}, errors.Errorf(
   243  						"no backup covers time [%s,%s) for range [%s,%s) or backups listed out of order (mismatched start time)",
   244  						ts, ie.start,
   245  						roachpb.Key(importRange.Start), roachpb.Key(importRange.End))
   246  				}
   247  				ts = ie.end
   248  			case backupFile:
   249  				if len(ie.file.Path) > 0 {
   250  					files = append(files, roachpb.ImportRequest_File{
   251  						Dir:    ie.dir,
   252  						Path:   ie.file.Path,
   253  						Sha512: ie.file.Sha512,
   254  					})
   255  				}
   256  			}
   257  		}
   258  		if needed {
   259  			if ts != maxEndTime {
   260  				if err := onMissing(importRange, ts, maxEndTime); err != nil {
   261  					return nil, hlc.Timestamp{}, err
   262  				}
   263  			}
   264  			// Ranges where needed is false hold backed-up data that is not necessary
   265  			// for this restore; they are skipped (no request entry is emitted).
   266  			requestEntries = append(requestEntries, importEntry{
   267  				Span:      roachpb.Span{Key: importRange.Start, EndKey: importRange.End},
   268  				entryType: request,
   269  				files:     files,
   270  			})
   271  		}
   272  	}
   273  	return requestEntries, maxEndTime, nil
   274  }
   275  
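        // overlapCoveringMergeExample is an illustrative sketch (not part of the
        // restore path) of the OverlapCoveringMerge behavior that makeImportSpans
        // relies on, mirroring the example in the makeImportSpans godoc above: the
        // [a, c) range of the backup covering is split at b, and each output range
        // carries the payloads of every input range that overlaps it, in input
        // order. The string payloads stand in for the importEntry payloads used above.
        func overlapCoveringMergeExample() []covering.Range {
        	backupFiles := covering.Covering{
        		{Start: []byte("a"), End: []byte("c"), Payload: "file1"},
        		{Start: []byte("c"), End: []byte("d"), Payload: "file2"},
        	}
        	requested := covering.Covering{
        		{Start: []byte("b"), End: []byte("d"), Payload: "requested"},
        	}
        	// Yields [a,b)->{file1}, [b,c)->{file1, requested}, [c,d)->{file2, requested};
        	// each output Payload is a []interface{} of the overlapping input payloads.
        	return covering.OverlapCoveringMerge([]covering.Covering{backupFiles, requested})
        }
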
   276  // splitAndScatter creates new ranges for importSpans and scatters replicas and
   277  // leaseholders to be as evenly balanced as possible. It does this with some
   278  // amount of parallelism but also staying as close to the order in importSpans
   279  // as possible (the more out of order, the more work is done if a RESTORE job
   280  // loses its lease and has to be restarted).
   281  //
   282  // At a high level, this is accomplished by splitting and scattering large
   283  // "chunks" from the front of importEntries in one goroutine, each of which is
   284  // in turn passed to one of many worker goroutines that split and scatter the
   285  // individual entries.
   286  //
   287  // importEntries are sent to readyForImportCh as they are scattered, so a
   288  // blocked send on that channel acts as backpressure on the splits and
   289  // scatters.
   290  //
   291  // TODO(dan): This logic is largely tuned by running BenchmarkRestore2TB. See if
   292  // there's some way to test it without running an O(hour) long benchmark.
   293  func splitAndScatter(
   294  	restoreCtx context.Context,
   295  	settings *cluster.Settings,
   296  	db *kv.DB,
   297  	kr *storageccl.KeyRewriter,
   298  	numClusterNodes int,
   299  	importSpans []importEntry,
   300  	readyForImportCh chan<- importEntry,
   301  ) error {
   302  	var span opentracing.Span
   303  	ctx, span := tracing.ChildSpan(restoreCtx, "presplit-scatter")
   304  	defer tracing.FinishSpan(span)
   305  
   306  	g := ctxgroup.WithContext(ctx)
   307  
   308  	// TODO(dan): This is not super principled. I just wanted something that wasn't
   309  	// a constant and grew slower than linear with the length of importSpans. It
   310  	// seems to be working well for BenchmarkRestore2TB but worth revisiting.
   311  	chunkSize := int(math.Sqrt(float64(len(importSpans))))
   312  	importSpanChunks := make([][]importEntry, 0, len(importSpans)/chunkSize)
   313  	for start := 0; start < len(importSpans); {
   314  		importSpanChunk := importSpans[start:]
   315  		end := start + chunkSize
   316  		if end < len(importSpans) {
   317  			importSpanChunk = importSpans[start:end]
   318  		}
   319  		importSpanChunks = append(importSpanChunks, importSpanChunk)
   320  		start = end
   321  	}
   322  
   323  	importSpanChunksCh := make(chan []importEntry)
   324  	expirationTime := db.Clock().Now().Add(time.Hour.Nanoseconds(), 0)
   325  	g.GoCtx(func(ctx context.Context) error {
   326  		defer close(importSpanChunksCh)
   327  		for idx, importSpanChunk := range importSpanChunks {
   328  			// TODO(dan): The structure of this and the below is very
   329  			// similar. Dedup.
   330  			chunkKey, err := rewriteBackupSpanKey(kr, importSpanChunk[0].Key)
   331  			if err != nil {
   332  				return err
   333  			}
   334  
   335  			// TODO(dan): Really, this should be splitting the Key of the first
   336  			// entry in the _next_ chunk.
   337  			log.VEventf(restoreCtx, 1, "presplitting chunk %d of %d", idx, len(importSpanChunks))
   338  			if err := db.AdminSplit(ctx, chunkKey, chunkKey, expirationTime); err != nil {
   339  				return err
   340  			}
   341  
   342  			log.VEventf(restoreCtx, 1, "scattering chunk %d of %d", idx, len(importSpanChunks))
   343  			scatterReq := &roachpb.AdminScatterRequest{
   344  				RequestHeader: roachpb.RequestHeaderFromSpan(roachpb.Span{
   345  					Key:    chunkKey,
   346  					EndKey: chunkKey.Next(),
   347  				}),
   348  				// TODO(dan): This is a bit of a hack, but it seems to be an effective
   349  				// one (see the PR that added it for graphs). As of the commit that
   350  				// added this, scatter is not very good at actually balancing leases.
   351  				// This is likely for two reasons: 1) there's almost certainly some
   352  				// regression in scatter's behavior, it used to work much better and 2)
   353  				// scatter has to operate by balancing leases for all ranges in a
   354  				// cluster, but in RESTORE, we really just want it to be balancing the
   355  				// span being restored into.
   356  				RandomizeLeases: true,
   357  			}
   358  			if _, pErr := kv.SendWrapped(ctx, db.NonTransactionalSender(), scatterReq); pErr != nil {
   359  				// TODO(dan): Unfortunately, Scatter is still too unreliable to
   360  				// fail the RESTORE when Scatter fails. I'm uncomfortable that
   361  				// this could break entirely and not start failing the tests,
   362  				// but on the bright side, it doesn't affect correctness, only
   363  				// throughput.
   364  				log.Errorf(ctx, "failed to scatter chunk %d: %s", idx, pErr.GoError())
   365  			}
   366  
   367  			select {
   368  			case <-ctx.Done():
   369  				return ctx.Err()
   370  			case importSpanChunksCh <- importSpanChunk:
   371  			}
   372  		}
   373  		return nil
   374  	})
   375  
   376  	// TODO(dan): This tries to cover for a bad scatter by using 2 * the number
   377  	// of nodes in the cluster as the worker count. Is it necessary?
   378  	splitScatterWorkers := numClusterNodes * 2
   379  	var splitScatterStarted uint64 // Only access using atomic.
   380  	for worker := 0; worker < splitScatterWorkers; worker++ {
   381  		g.GoCtx(func(ctx context.Context) error {
   382  			for importSpanChunk := range importSpanChunksCh {
   383  				for _, importSpan := range importSpanChunk {
   384  					idx := atomic.AddUint64(&splitScatterStarted, 1)
   385  
   386  					newSpanKey, err := rewriteBackupSpanKey(kr, importSpan.Span.Key)
   387  					if err != nil {
   388  						return err
   389  					}
   390  
   391  					// TODO(dan): Really, this should be splitting the Key of
   392  					// the _next_ entry.
   393  					log.VEventf(restoreCtx, 1, "presplitting %d of %d", idx, len(importSpans))
   394  					if err := db.AdminSplit(ctx, newSpanKey, newSpanKey, expirationTime); err != nil {
   395  						return err
   396  					}
   397  
   398  					log.VEventf(restoreCtx, 1, "scattering %d of %d", idx, len(importSpans))
   399  					scatterReq := &roachpb.AdminScatterRequest{
   400  						RequestHeader: roachpb.RequestHeaderFromSpan(roachpb.Span{Key: newSpanKey, EndKey: newSpanKey.Next()}),
   401  					}
   402  					if _, pErr := kv.SendWrapped(ctx, db.NonTransactionalSender(), scatterReq); pErr != nil {
   403  						// TODO(dan): Unfortunately, Scatter is still too unreliable to
   404  						// fail the RESTORE when Scatter fails. I'm uncomfortable that
   405  						// this could break entirely and not start failing the tests,
   406  						// but on the bright side, it doesn't affect correctness, only
   407  						// throughput.
   408  						log.Errorf(ctx, "failed to scatter %d: %s", idx, pErr.GoError())
   409  					}
   410  
   411  					select {
   412  					case <-ctx.Done():
   413  						return ctx.Err()
   414  					case readyForImportCh <- importSpan:
   415  					}
   416  				}
   417  			}
   418  			return nil
   419  		})
   420  	}
   421  
   422  	return g.Wait()
   423  }
   424  
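        // splitAndScatterExample is a minimal sketch (not called by the restore path)
        // of how restore wires splitAndScatter to a consumer: the consumer ranges over
        // readyForImportCh, and because sends block once the small buffer fills, a
        // slow consumer naturally backpressures the split/scatter work. The process
        // callback is a placeholder for the Import requests issued by restore below.
        func splitAndScatterExample(
        	ctx context.Context,
        	settings *cluster.Settings,
        	db *kv.DB,
        	kr *storageccl.KeyRewriter,
        	numClusterNodes int,
        	importSpans []importEntry,
        	process func(context.Context, importEntry) error,
        ) error {
        	readyForImportCh := make(chan importEntry, 10)
        	g := ctxgroup.WithContext(ctx)
        	g.GoCtx(func(ctx context.Context) error {
        		// Producer: split/scatter each span, then hand it to the consumer.
        		defer close(readyForImportCh)
        		return splitAndScatter(ctx, settings, db, kr, numClusterNodes, importSpans, readyForImportCh)
        	})
        	g.GoCtx(func(ctx context.Context) error {
        		// Consumer: process spans in the order they become ready.
        		for entry := range readyForImportCh {
        			if err := process(ctx, entry); err != nil {
        				return err
        			}
        		}
        		return nil
        	})
        	return g.Wait()
        }
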
   425  // WriteTableDescs writes all the new descriptors: first the ID ->
   426  // TableDescriptor entry for each new table, then it flips (or initializes) the
   427  // name -> ID entry so any new queries will use the new one. The tables are
   428  // assigned the permissions of their parent database, and the user must have
   429  // CREATE permission on that database at the time this function is called.
   430  func WriteTableDescs(
   431  	ctx context.Context,
   432  	txn *kv.Txn,
   433  	databases []*sqlbase.DatabaseDescriptor,
   434  	tables []*sqlbase.TableDescriptor,
   435  	descCoverage tree.DescriptorCoverage,
   436  	settings *cluster.Settings,
   437  	extra []roachpb.KeyValue,
   438  ) error {
   439  	ctx, span := tracing.ChildSpan(ctx, "WriteTableDescs")
   440  	defer tracing.FinishSpan(span)
   441  	err := func() error {
   442  		b := txn.NewBatch()
   443  		wroteDBs := make(map[sqlbase.ID]*sqlbase.DatabaseDescriptor)
   444  		for _, desc := range databases {
   445  			// If the restore is not a full cluster restore we cannot know that
   446  			// the users on the restoring cluster match the ones that were on the
   447  			// cluster that was backed up. So we wipe the privileges on the database.
   448  			if descCoverage != tree.AllDescriptors {
   449  				desc.Privileges = sqlbase.NewDefaultPrivilegeDescriptor()
   450  			}
   451  			wroteDBs[desc.ID] = desc
   452  			if err := catalogkv.WriteNewDescToBatch(ctx, false /* kvTrace */, settings, b, keys.SystemSQLCodec, desc.ID, desc); err != nil {
   453  				return err
   454  			}
   455  			// Depending on which cluster version we are restoring to, we decide which
   456  			// namespace table to write the descriptor into. This may cause wrong
   457  			// behavior if the cluster version is bumped DURING a restore.
   458  			dKey := sqlbase.MakeDatabaseNameKey(ctx, settings, desc.Name)
   459  			b.CPut(dKey.Key(keys.SystemSQLCodec), desc.ID, nil)
   460  		}
   461  		for i := range tables {
   462  			// For full cluster restore, keep privileges as they were.
   463  			if wrote, ok := wroteDBs[tables[i].ParentID]; ok {
   464  				// Leave the privileges of the temp system tables as
   465  				// the default.
   466  				if descCoverage != tree.AllDescriptors || wrote.Name == restoreTempSystemDB {
   467  					tables[i].Privileges = wrote.GetPrivileges()
   468  				}
   469  			} else {
   470  				parentDB, err := sqlbase.GetDatabaseDescFromID(ctx, txn, keys.SystemSQLCodec, tables[i].ParentID)
   471  				if err != nil {
   472  					return errors.Wrapf(err,
   473  						"failed to lookup parent DB %d", errors.Safe(tables[i].ParentID))
   474  				}
   475  				// We don't check priv's here since we checked them during job planning.
   476  
   477  				// On full cluster restore, keep the privs as they are in the backup.
   478  				if descCoverage != tree.AllDescriptors {
   479  					// Default is to copy privs from restoring parent db, like CREATE TABLE.
   480  					// TODO(dt): Make this more configurable.
   481  					tables[i].Privileges = parentDB.GetPrivileges()
   482  				}
   483  			}
   484  			if err := catalogkv.WriteNewDescToBatch(ctx, false /* kvTrace */, settings, b, keys.SystemSQLCodec, tables[i].ID, tables[i]); err != nil {
   485  				return err
   486  			}
   487  			// Depending on which cluster version we are restoring to, we decide which
   488  			// namespace table to write the descriptor into. This may cause wrong
   489  			// behavior if the cluster version is bumped DURING a restore.
   490  			tkey := sqlbase.MakePublicTableNameKey(ctx, settings, tables[i].ParentID, tables[i].Name)
   491  			b.CPut(tkey.Key(keys.SystemSQLCodec), tables[i].ID, nil)
   492  		}
   493  		for _, kv := range extra {
   494  			b.InitPut(kv.Key, &kv.Value, false)
   495  		}
   496  		if err := txn.Run(ctx, b); err != nil {
   497  			if errors.HasType(err, (*roachpb.ConditionFailedError)(nil)) {
   498  				return pgerror.Newf(pgcode.DuplicateObject, "table already exists")
   499  			}
   500  			return err
   501  		}
   502  
   503  		for _, table := range tables {
   504  			if err := table.Validate(ctx, txn, keys.SystemSQLCodec); err != nil {
   505  				return errors.Wrapf(err,
   506  					"validate table %d", errors.Safe(table.ID))
   507  			}
   508  		}
   509  		return nil
   510  	}()
   511  	return errors.Wrapf(err, "restoring table desc and namespace entries")
   512  }
   513  
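        // writeTableDescsExample is an illustrative sketch (not used by the restore
        // path) of calling WriteTableDescs inside a transaction, mirroring how
        // createImportingTables below writes the new, still-OFFLINE descriptors.
        func writeTableDescsExample(
        	ctx context.Context,
        	db *kv.DB,
        	settings *cluster.Settings,
        	databases []*sqlbase.DatabaseDescriptor,
        	tables []*sqlbase.TableDescriptor,
        	descCoverage tree.DescriptorCoverage,
        ) error {
        	return db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
        		// WriteTableDescs stages the ID -> descriptor and name -> ID entries in
        		// a batch and runs it within this transaction.
        		return WriteTableDescs(ctx, txn, databases, tables, descCoverage, settings, nil /* extra */)
        	})
        }
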
   514  // rewriteBackupSpanKey rewrites a backup span start key for the purposes of
   515  // splitting up the target key-space to send out the actual work of restoring.
   516  //
   517  // Keys for the primary index of the top-level table are rewritten to just the
   518  // overall start of the table. That is, /Table/51/1 becomes /Table/51.
   519  //
   520  // Any suffix of the key that is not rewritten by kr's configured rewrites is
   521  // truncated. For instance, if a passed span has key /Table/51/1/77#/53/2/1 but
   522  // kr is only configured with a rewrite for 51, it would return /Table/51/1/77.
   523  // Such span boundaries are usually due to an interleaved table which has since
   524  // been dropped -- any splits that happened to pick one of its rows live on, but
   525  // include an ID of a table that no longer exists.
   526  //
   527  // Note that the actual restore process (i.e. inside ImportRequest) does not use
   528  // these keys -- they are only used to split the key space and distribute those
   529  // requests, thus truncation is fine. In the rare case where multiple backup
   530  // spans are truncated to the same prefix (i.e. entire spans resided under the
   531  // same interleave parent row) we'll generate some no-op splits and route the
   532  // work to the same range, but the actual imported data is unaffected.
   533  func rewriteBackupSpanKey(kr *storageccl.KeyRewriter, key roachpb.Key) (roachpb.Key, error) {
   534  	newKey, rewritten, err := kr.RewriteKey(append([]byte(nil), key...), true /* isFromSpan */)
   535  	if err != nil {
   536  		return nil, errors.NewAssertionErrorWithWrappedErrf(err,
   537  			"could not rewrite span start key: %s", key)
   538  	}
   539  	if !rewritten && bytes.Equal(newKey, key) {
   540  		// if nothing was changed, we didn't match the top-level key at all.
   541  		return nil, errors.AssertionFailedf(
   542  			"no rewrite for span start key: %s", key)
   543  	}
   544  	// Modify all spans that begin at the primary index to instead begin at the
   545  	// start of the table. That is, change a span start key from /Table/51/1 to
   546  	// /Table/51. Otherwise a permanently empty span at /Table/51-/Table/51/1
   547  	// will be created.
   548  	if b, id, idx, err := keys.TODOSQLCodec.DecodeIndexPrefix(newKey); err != nil {
   549  		return nil, errors.NewAssertionErrorWithWrappedErrf(err,
   550  			"could not rewrite span start key: %s", key)
   551  	} else if idx == 1 && len(b) == 0 {
   552  		newKey = keys.TODOSQLCodec.TablePrefix(id)
   553  	}
   554  	return newKey, nil
   555  }
   556  
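        // indexKeyToTablePrefixExample is an illustrative sketch of the final step of
        // rewriteBackupSpanKey above: a rewritten key that points at the start of a
        // table's primary index (e.g. /Table/51/1) is widened to the table prefix
        // (/Table/51) so that no permanently empty span is left in front of the index.
        func indexKeyToTablePrefixExample(key roachpb.Key) (roachpb.Key, error) {
        	rem, id, idx, err := keys.TODOSQLCodec.DecodeIndexPrefix(key)
        	if err != nil {
        		return nil, err
        	}
        	if idx == 1 && len(rem) == 0 {
        		// The key is exactly the start of the primary index: use the table prefix.
        		return keys.TODOSQLCodec.TablePrefix(id), nil
        	}
        	return key, nil
        }
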
   557  // restore imports a SQL table (or tables) from sets of non-overlapping sstable
   558  // files.
   559  func restore(
   560  	restoreCtx context.Context,
   561  	db *kv.DB,
   562  	numClusterNodes int,
   563  	settings *cluster.Settings,
   564  	backupManifests []BackupManifest,
   565  	backupLocalityInfo []jobspb.RestoreDetails_BackupLocalityInfo,
   566  	endTime hlc.Timestamp,
   567  	tables []*sqlbase.TableDescriptor,
   568  	oldTableIDs []sqlbase.ID,
   569  	spans []roachpb.Span,
   570  	job *jobs.Job,
   571  	encryption *roachpb.FileEncryptionOptions,
   572  ) (RowCount, error) {
   573  	// A note about contexts and spans in this method: the top-level context
   574  	// `restoreCtx` is used for orchestration logging. All operations that carry
   575  	// out work get their individual contexts.
   576  
   577  	mu := struct {
   578  		syncutil.Mutex
   579  		res               RowCount
   580  		requestsCompleted []bool
   581  		highWaterMark     int
   582  	}{
   583  		highWaterMark: -1,
   584  	}
   585  
   586  	// Get TableRekeys to use when importing raw data.
   587  	var rekeys []roachpb.ImportRequest_TableRekey
   588  	for i := range tables {
   589  		tableToSerialize := tables[i]
   590  		newDescBytes, err := protoutil.Marshal(sqlbase.WrapDescriptor(tableToSerialize))
   591  		if err != nil {
   592  			return mu.res, errors.NewAssertionErrorWithWrappedErrf(err,
   593  				"marshaling descriptor")
   594  		}
   595  		rekeys = append(rekeys, roachpb.ImportRequest_TableRekey{
   596  			OldID:   uint32(oldTableIDs[i]),
   597  			NewDesc: newDescBytes,
   598  		})
   599  	}
   600  	kr, err := storageccl.MakeKeyRewriterFromRekeys(rekeys)
   601  	if err != nil {
   602  		return mu.res, err
   603  	}
   604  
   605  	// Pivot the backups, which are grouped by time, into requests for import,
   606  	// which are grouped by keyrange.
   607  	highWaterMark := job.Progress().Details.(*jobspb.Progress_Restore).Restore.HighWater
   608  	importSpans, _, err := makeImportSpans(spans, backupManifests, backupLocalityInfo, highWaterMark, errOnMissingRange)
   609  	if err != nil {
   610  		return mu.res, errors.Wrapf(err, "making import requests for %d backups", len(backupManifests))
   611  	}
   612  
   613  	for i := range importSpans {
   614  		importSpans[i].progressIdx = i
   615  	}
   616  	mu.requestsCompleted = make([]bool, len(importSpans))
   617  
   618  	progressLogger := jobs.NewChunkProgressLogger(job, len(importSpans), job.FractionCompleted(),
   619  		func(progressedCtx context.Context, details jobspb.ProgressDetails) {
   620  			switch d := details.(type) {
   621  			case *jobspb.Progress_Restore:
   622  				mu.Lock()
   623  				if mu.highWaterMark >= 0 {
   624  					d.Restore.HighWater = importSpans[mu.highWaterMark].Key
   625  				}
   626  				mu.Unlock()
   627  			default:
   628  				log.Errorf(progressedCtx, "job payload had unexpected type %T", d)
   629  			}
   630  		})
   631  
   632  	pkIDs := make(map[uint64]struct{})
   633  	for _, tbl := range tables {
   634  		pkIDs[roachpb.BulkOpSummaryID(uint64(tbl.ID), uint64(tbl.PrimaryIndex.ID))] = struct{}{}
   635  	}
   636  
   637  	// We're already limiting these on the server-side, but sending all the
   638  	// Import requests at once would fill up distsender/grpc/something and cause
   639  	// all sorts of badness (node liveness timeouts leading to mass leaseholder
   640  	// transfers, poor performance on SQL workloads, etc) as well as log spam
   641  	// about slow distsender requests. Rate limit them here, too.
   642  	//
   643  	// Use the number of cpus across all nodes in the cluster as the number of
   644  	// outstanding Import requests for the rate limiting. Note that this assumes
   645  	// all nodes in the cluster have the same number of cpus, but it's okay if
   646  	// that's wrong.
   647  	//
   648  	// TODO(dan): Make this limiting per node.
   649  	maxConcurrentImports := numClusterNodes * runtime.NumCPU()
   650  	importsSem := make(chan struct{}, maxConcurrentImports)
   651  
   652  	g := ctxgroup.WithContext(restoreCtx)
   653  
   654  	// The Import (and resulting AddSSTable) requests made below run on
   655  	// leaseholders, so presplit and scatter the ranges to balance the work
   656  	// among many nodes.
   657  	//
   658  	// We're about to start off some goroutines that presplit & scatter each
   659  	// import span. Once split and scattered, the span is submitted to
   660  	// readyForImportCh to indicate it's ready for Import. Since import is so
   661  	// much slower, we buffer the channel to keep the split/scatter work from
   662  	// getting too far ahead. This both naturally rate limits the split/scatters
   663  	// and bounds the number of empty ranges created if the RESTORE fails (or is
   664  	// canceled).
   665  	const presplitLeadLimit = 10
   666  	readyForImportCh := make(chan importEntry, presplitLeadLimit)
   667  	g.GoCtx(func(ctx context.Context) error {
   668  		defer close(readyForImportCh)
   669  		return splitAndScatter(ctx, settings, db, kr, numClusterNodes, importSpans, readyForImportCh)
   670  	})
   671  
   672  	requestFinishedCh := make(chan struct{}, len(importSpans)) // enough buffer to never block
   673  	g.GoCtx(func(ctx context.Context) error {
   674  		ctx, progressSpan := tracing.ChildSpan(ctx, "progress-log")
   675  		defer tracing.FinishSpan(progressSpan)
   676  		return progressLogger.Loop(ctx, requestFinishedCh)
   677  	})
   678  	g.GoCtx(func(ctx context.Context) error {
   679  		log.Eventf(restoreCtx, "commencing import of data with concurrency %d", maxConcurrentImports)
   680  		for readyForImportSpan := range readyForImportCh {
   681  			newSpanKey, err := rewriteBackupSpanKey(kr, readyForImportSpan.Span.Key)
   682  			if err != nil {
   683  				return err
   684  			}
   685  			idx := readyForImportSpan.progressIdx
   686  
   687  			importRequest := &roachpb.ImportRequest{
   688  				// Import is a point request because we don't want DistSender to split
   689  				// it. Assume (but don't require) the entire post-rewrite span is on the
   690  				// same range.
   691  				RequestHeader: roachpb.RequestHeader{Key: newSpanKey},
   692  				DataSpan:      readyForImportSpan.Span,
   693  				Files:         readyForImportSpan.files,
   694  				EndTime:       endTime,
   695  				Rekeys:        rekeys,
   696  				Encryption:    encryption,
   697  			}
   698  
   699  			log.VEventf(restoreCtx, 1, "importing %d of %d", idx, len(importSpans))
   700  
   701  			select {
   702  			case importsSem <- struct{}{}:
   703  			case <-ctx.Done():
   704  				return ctx.Err()
   705  			}
   706  
   707  			g.GoCtx(func(ctx context.Context) error {
   708  				ctx, importSpan := tracing.ChildSpan(ctx, "import")
   709  				log.Event(ctx, "acquired semaphore")
   710  				defer tracing.FinishSpan(importSpan)
   711  				defer func() { <-importsSem }()
   712  
   713  				importRes, pErr := kv.SendWrapped(ctx, db.NonTransactionalSender(), importRequest)
   714  				if pErr != nil {
   715  					return errors.Wrapf(pErr.GoError(), "importing span %v", importRequest.DataSpan)
   716  
   717  				}
   718  
   719  				mu.Lock()
   720  				mu.res.add(countRows(importRes.(*roachpb.ImportResponse).Imported, pkIDs))
   721  
   722  				// Assert that we're actually marking the correct span done. See #23977.
   723  				if !importSpans[idx].Key.Equal(importRequest.DataSpan.Key) {
   724  					mu.Unlock()
   725  					return errors.Newf("request %d for span %v (to %v) does not match import span for same idx: %v",
   726  						idx, importRequest.DataSpan, newSpanKey, importSpans[idx],
   727  					)
   728  				}
   729  				mu.requestsCompleted[idx] = true
   730  				for j := mu.highWaterMark + 1; j < len(mu.requestsCompleted) && mu.requestsCompleted[j]; j++ {
   731  					mu.highWaterMark = j
   732  				}
   733  				mu.Unlock()
   734  
   735  				requestFinishedCh <- struct{}{}
   736  				return nil
   737  			})
   738  		}
   739  		log.Event(restoreCtx, "wait for outstanding imports to finish")
   740  		return nil
   741  	})
   742  
   743  	if err := g.Wait(); err != nil {
   744  		// This leaves the data that did get imported in case the user wants to
   745  		// retry.
   746  		// TODO(dan): Build tooling to allow a user to restart a failed restore.
   747  		return mu.res, errors.Wrapf(err, "importing %d ranges", len(importSpans))
   748  	}
   749  
   750  	return mu.res, nil
   751  }
   752  
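        // boundedImportExample is an illustrative sketch (not used by the restore
        // path) of the channel-as-semaphore pattern restore uses above to bound the
        // number of in-flight Import requests: a send acquires a slot and the
        // deferred receive releases it. The work callbacks are placeholders for the
        // Import requests issued by restore.
        func boundedImportExample(
        	ctx context.Context, maxInFlight int, work []func(context.Context) error,
        ) error {
        	sem := make(chan struct{}, maxInFlight)
        	g := ctxgroup.WithContext(ctx)
        	g.GoCtx(func(ctx context.Context) error {
        		for i := range work {
        			w := work[i]
        			// Acquire a semaphore slot, or bail out if the context is canceled.
        			select {
        			case sem <- struct{}{}:
        			case <-ctx.Done():
        				return ctx.Err()
        			}
        			g.GoCtx(func(ctx context.Context) error {
        				defer func() { <-sem }() // Release the slot.
        				return w(ctx)
        			})
        		}
        		return nil
        	})
        	return g.Wait()
        }
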
   753  // loadBackupSQLDescs extracts the backup manifests, the latest backup
   754  // manifest, and all the Descriptors for a backup to be restored. It upgrades
   755  // the table descriptors to the new FK representation if necessary. FKs that
   756  // can't be restored because the necessary tables are missing are omitted; if
   757  // skip_missing_foreign_keys was set, we should have aborted the RESTORE and
   758  // returned an error prior to this.
   759  func loadBackupSQLDescs(
   760  	ctx context.Context,
   761  	p sql.PlanHookState,
   762  	details jobspb.RestoreDetails,
   763  	encryption *roachpb.FileEncryptionOptions,
   764  ) ([]BackupManifest, BackupManifest, []sqlbase.Descriptor, error) {
   765  	backupManifests, err := loadBackupManifests(ctx, details.URIs, p.ExecCfg().DistSQLSrv.ExternalStorageFromURI, encryption)
   766  	if err != nil {
   767  		return nil, BackupManifest{}, nil, err
   768  	}
   769  
   770  	// Upgrade the table descriptors to use the new FK representation.
   771  	// TODO(lucy, jordan): This should become unnecessary in 20.1 when we stop
   772  	// writing old-style descs in RestoreDetails (unless a job persists across
   773  	// an upgrade?).
   774  	if err := maybeUpgradeTableDescsInBackupManifests(ctx, backupManifests, p.ExecCfg().Codec, true /* skipFKsWithNoMatchingTable */); err != nil {
   775  		return nil, BackupManifest{}, nil, err
   776  	}
   777  
   778  	allDescs, latestBackupManifest := loadSQLDescsFromBackupsAtTime(backupManifests, details.EndTime)
   779  
   780  	var sqlDescs []sqlbase.Descriptor
   781  	for _, desc := range allDescs {
   782  		if _, ok := details.TableRewrites[desc.GetID()]; ok {
   783  			sqlDescs = append(sqlDescs, desc)
   784  		}
   785  	}
   786  	return backupManifests, latestBackupManifest, sqlDescs, nil
   787  }
   788  
   789  type restoreResumer struct {
   790  	job                *jobs.Job
   791  	settings           *cluster.Settings
   792  	databases          []*sqlbase.DatabaseDescriptor
   793  	tables             []*sqlbase.TableDescriptor
   794  	descriptorCoverage tree.DescriptorCoverage
   795  	latestStats        []*stats.TableStatisticProto
   796  	execCfg            *sql.ExecutorConfig
   797  }
   798  
   799  // remapRelevantStatistics changes the table ID references in the stats
   800  // from those they had in the backed up database to what they should be
   801  // in the restored database.
   802  // It also selects only the statistics which belong to one of the tables
   803  // being restored: if tableRewrites contains a rewrite for a table's ID, then
   804  // that table is being restored.
   805  func remapRelevantStatistics(
   806  	backup BackupManifest, tableRewrites TableRewriteMap,
   807  ) []*stats.TableStatisticProto {
   808  	relevantTableStatistics := make([]*stats.TableStatisticProto, 0, len(backup.Statistics))
   809  
   810  	for i := range backup.Statistics {
   811  		stat := backup.Statistics[i]
   812  		tableRewrite, ok := tableRewrites[stat.TableID]
   813  		if !ok {
   814  			// Table re-write not present, so statistic should not be imported.
   815  			continue
   816  		}
   817  		stat.TableID = tableRewrite.TableID
   818  		relevantTableStatistics = append(relevantTableStatistics, stat)
   819  	}
   820  
   821  	return relevantTableStatistics
   822  }
   823  
   824  // isDatabaseEmpty checks whether any tables exist in the given database.
   825  // It pretends that the `ignoredTables` do not exist for the purposes of
   826  // checking if a database is empty.
   827  //
   828  // It is used to construct a transaction which deletes a set of tables as well
   829  // as some empty databases. However, we want to check that the databases are
   830  // empty _after_ the transaction would have completed, so we want to ignore
   831  // the tables that we're deleting in the same transaction. It is done this way
   832  // to avoid having 2 transactions reading and writing the same keys one right
   833  // after the other.
   834  func isDatabaseEmpty(
   835  	ctx context.Context,
   836  	db *kv.DB,
   837  	dbDesc *sql.DatabaseDescriptor,
   838  	ignoredTables map[sqlbase.ID]struct{},
   839  ) (bool, error) {
   840  	var allDescs []sqlbase.Descriptor
   841  	if err := db.Txn(
   842  		ctx,
   843  		func(ctx context.Context, txn *kv.Txn) error {
   844  			var err error
   845  			allDescs, err = allSQLDescriptors(ctx, txn)
   846  			return err
   847  		}); err != nil {
   848  		return false, err
   849  	}
   850  
   851  	for _, desc := range allDescs {
   852  		if t := desc.Table(hlc.Timestamp{}); t != nil {
   853  			if _, ok := ignoredTables[t.GetID()]; ok {
   854  				continue
   855  			}
   856  			if t.GetParentID() == dbDesc.ID {
   857  				return false, nil
   858  			}
   859  		}
   860  	}
   861  	return true, nil
   862  }
   863  
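        // isDatabaseEmptyExample is an illustrative sketch of how dropTables (below)
        // uses isDatabaseEmpty: the tables deleted in the same transaction are passed
        // as ignoredTables so the emptiness check reflects the post-transaction state.
        func isDatabaseEmptyExample(
        	ctx context.Context,
        	db *kv.DB,
        	dbDesc *sql.DatabaseDescriptor,
        	beingDeleted []*sqlbase.TableDescriptor,
        ) (bool, error) {
        	ignoredTables := make(map[sqlbase.ID]struct{}, len(beingDeleted))
        	for _, tbl := range beingDeleted {
        		ignoredTables[tbl.ID] = struct{}{}
        	}
        	return isDatabaseEmpty(ctx, db, dbDesc, ignoredTables)
        }
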
   864  // createImportingTables creates the tables that we will restore into. It also
   865  // fetches the information from the old tables that we need for the restore.
   866  func createImportingTables(
   867  	ctx context.Context, p sql.PlanHookState, sqlDescs []sqlbase.Descriptor, r *restoreResumer,
   868  ) (
   869  	[]*sqlbase.DatabaseDescriptor,
   870  	[]*sqlbase.TableDescriptor,
   871  	[]sqlbase.ID,
   872  	[]roachpb.Span,
   873  	error,
   874  ) {
   875  	details := r.job.Details().(jobspb.RestoreDetails)
   876  
   877  	var databases []*sqlbase.DatabaseDescriptor
   878  	var tables []*sqlbase.TableDescriptor
   879  	var oldTableIDs []sqlbase.ID
   880  	for _, desc := range sqlDescs {
   881  		if tableDesc := desc.Table(hlc.Timestamp{}); tableDesc != nil {
   882  			tables = append(tables, tableDesc)
   883  			oldTableIDs = append(oldTableIDs, tableDesc.ID)
   884  		}
   885  		if dbDesc := desc.GetDatabase(); dbDesc != nil {
   886  			if rewrite, ok := details.TableRewrites[dbDesc.ID]; ok {
   887  				dbDesc.ID = rewrite.TableID
   888  				databases = append(databases, dbDesc)
   889  			}
   890  		}
   891  	}
   892  	tempSystemDBID := keys.MinNonPredefinedUserDescID
   893  	for id := range details.TableRewrites {
   894  		if int(id) > tempSystemDBID {
   895  			tempSystemDBID = int(id)
   896  		}
   897  	}
   898  	if details.DescriptorCoverage == tree.AllDescriptors {
   899  		databases = append(databases, &sqlbase.DatabaseDescriptor{
   900  			ID:         sqlbase.ID(tempSystemDBID),
   901  			Name:       restoreTempSystemDB,
   902  			Privileges: sqlbase.NewDefaultPrivilegeDescriptor(),
   903  		})
   904  	}
   905  
   906  	// We get the spans of the restoring tables _as they appear in the backup_,
   907  	// that is, in the 'old' keyspace, before we reassign the table IDs.
   908  	spans := spansForAllTableIndexes(p.ExecCfg().Codec, tables, nil)
   909  
   910  	log.Eventf(ctx, "starting restore for %d tables", len(tables))
   911  
   912  	// Assign new IDs and privileges to the tables, and update all references to
   913  	// use the new IDs.
   914  	if err := RewriteTableDescs(tables, details.TableRewrites, details.OverrideDB); err != nil {
   915  		return nil, nil, nil, nil, err
   916  	}
   917  
   918  	for _, desc := range tables {
   919  		desc.Version++
   920  		desc.State = sqlbase.TableDescriptor_OFFLINE
   921  		desc.OfflineReason = "restoring"
   922  	}
   923  
   924  	if !details.PrepareCompleted {
   925  		err := p.ExecCfg().DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   926  			// Write the new TableDescriptors which are set in the OFFLINE state.
   927  			if err := WriteTableDescs(ctx, txn, databases, tables, details.DescriptorCoverage, r.settings, nil /* extra */); err != nil {
   928  				return errors.Wrapf(err, "restoring %d TableDescriptors from %d databases", len(r.tables), len(databases))
   929  			}
   930  
   931  			details.PrepareCompleted = true
   932  			details.TableDescs = tables
   933  
   934  			// Update the job once all descs have been prepared for ingestion.
   935  			err := r.job.WithTxn(txn).SetDetails(ctx, details)
   936  
   937  			return err
   938  		})
   939  		if err != nil {
   940  			return nil, nil, nil, nil, err
   941  		}
   942  	}
   943  
   944  	return databases, tables, oldTableIDs, spans, nil
   945  }
   946  
   947  // Resume is part of the jobs.Resumer interface.
   948  func (r *restoreResumer) Resume(
   949  	ctx context.Context, phs interface{}, resultsCh chan<- tree.Datums,
   950  ) error {
   951  	details := r.job.Details().(jobspb.RestoreDetails)
   952  	p := phs.(sql.PlanHookState)
   953  
   954  	backupManifests, latestBackupManifest, sqlDescs, err := loadBackupSQLDescs(
   955  		ctx, p, details, details.Encryption,
   956  	)
   957  	if err != nil {
   958  		return err
   959  	}
   960  
   961  	databases, tables, oldTableIDs, spans, err := createImportingTables(ctx, p, sqlDescs, r)
   962  	if err != nil {
   963  		return err
   964  	}
   965  	r.tables = tables
   966  	r.descriptorCoverage = details.DescriptorCoverage
   967  	r.databases = databases
   968  	r.execCfg = p.ExecCfg()
   969  	r.latestStats = remapRelevantStatistics(latestBackupManifest, details.TableRewrites)
   970  
   971  	if len(r.tables) == 0 {
   972  		// We have no tables to restore (we are restoring an empty DB).
   973  		// Since we have already created any new databases that we needed,
   974  		// we can return without importing any data.
   975  		log.Warning(ctx, "no tables to restore")
   976  		return nil
   977  	}
   978  
   979  	numClusterNodes, err := clusterNodeCount(p.ExecCfg().Gossip)
   980  	if err != nil {
   981  		return err
   982  	}
   983  
   984  	res, err := restore(
   985  		ctx,
   986  		p.ExecCfg().DB,
   987  		numClusterNodes,
   988  		p.ExecCfg().Settings,
   989  		backupManifests,
   990  		details.BackupLocalityInfo,
   991  		details.EndTime,
   992  		tables,
   993  		oldTableIDs,
   994  		spans,
   995  		r.job,
   996  		details.Encryption,
   997  	)
   998  	if err != nil {
   999  		return err
  1000  	}
  1001  
  1002  	if err := r.insertStats(ctx); err != nil {
  1003  		return errors.Wrap(err, "inserting table statistics")
  1004  	}
  1005  
  1006  	if err := r.publishTables(ctx); err != nil {
  1007  		return err
  1008  	}
  1009  
  1010  	if r.descriptorCoverage == tree.AllDescriptors {
  1011  		if err := r.restoreSystemTables(ctx); err != nil {
  1012  			return err
  1013  		}
  1014  	}
  1015  
  1016  	resultsCh <- tree.Datums{
  1017  		tree.NewDInt(tree.DInt(*r.job.ID())),
  1018  		tree.NewDString(string(jobs.StatusSucceeded)),
  1019  		tree.NewDFloat(tree.DFloat(1.0)),
  1020  		tree.NewDInt(tree.DInt(res.Rows)),
  1021  		tree.NewDInt(tree.DInt(res.IndexEntries)),
  1022  		tree.NewDInt(tree.DInt(res.DataSize)),
  1023  	}
  1024  
  1025  	// Collect telemetry.
  1026  	{
  1027  		telemetry.Count("restore.total.succeeded")
  1028  		const mb = 1 << 20
  1029  		sizeMb := res.DataSize / mb
  1030  		sec := int64(timeutil.Since(timeutil.FromUnixMicros(r.job.Payload().StartedMicros)).Seconds())
  1031  		var mbps int64
  1032  		if sec > 0 {
  1033  			mbps = sizeMb / sec
  1034  		}
  1035  		telemetry.CountBucketed("restore.duration-sec.succeeded", sec)
  1036  		telemetry.CountBucketed("restore.size-mb.full", sizeMb)
  1037  		telemetry.CountBucketed("restore.speed-mbps.total", mbps)
  1038  		telemetry.CountBucketed("restore.speed-mbps.per-node", mbps/int64(numClusterNodes))
  1039  		// Tiny restores may skew throughput numbers due to overhead.
  1040  		if sizeMb > 10 {
  1041  			telemetry.CountBucketed("restore.speed-mbps.over10mb", mbps)
  1042  			telemetry.CountBucketed("restore.speed-mbps.over10mb.per-node", mbps/int64(numClusterNodes))
  1043  		}
  1044  	}
  1045  	return nil
  1046  }
  1047  
  1048  // insertStats re-inserts the table statistics stored in the backup manifest.
  1049  func (r *restoreResumer) insertStats(ctx context.Context) error {
  1050  	details := r.job.Details().(jobspb.RestoreDetails)
  1051  	if details.StatsInserted {
  1052  		return nil
  1053  	}
  1054  
  1055  	err := r.execCfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  1056  		if err := stats.InsertNewStats(ctx, r.execCfg.InternalExecutor, txn, r.latestStats); err != nil {
  1057  			return errors.Wrapf(err, "inserting stats from backup")
  1058  		}
  1059  		details.StatsInserted = true
  1060  		if err := r.job.WithTxn(txn).SetDetails(ctx, details); err != nil {
  1061  			return errors.Wrapf(err, "updating job marking stats insertion complete")
  1062  		}
  1063  		return nil
  1064  	})
  1065  	if err != nil {
  1066  		return err
  1067  	}
  1068  	return nil
  1069  }
  1070  
  1071  // publishTables updates the RESTORED tables status from OFFLINE to PUBLIC.
  1072  func (r *restoreResumer) publishTables(ctx context.Context) error {
  1073  	details := r.job.Details().(jobspb.RestoreDetails)
  1074  	if details.TablesPublished {
  1075  		return nil
  1076  	}
  1077  	log.Event(ctx, "making tables live")
  1078  
  1079  	newSchemaChangeJobs := make([]*jobs.StartableJob, 0)
  1080  	err := r.execCfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  1081  		// Write the new TableDescriptors and flip state over to public so they can be
  1082  		// accessed.
  1083  		b := txn.NewBatch()
  1084  		for _, tbl := range r.tables {
  1085  			tableDesc := *tbl
  1086  			tableDesc.Version++
  1087  			tableDesc.State = sqlbase.TableDescriptor_PUBLIC
  1088  			// Take any mutations that were in progress on the table descriptor when
  1089  			// the backup was taken and convert them to schema change jobs.
  1090  			newJobs, err := createSchemaChangeJobsFromMutations(ctx, r.execCfg.JobRegistry, r.execCfg.Codec, txn, r.job.Payload().Username, &tableDesc)
  1091  			if err != nil {
  1092  				return err
  1093  			}
  1094  			newSchemaChangeJobs = append(newSchemaChangeJobs, newJobs...)
  1095  			existingDescVal, err := sqlbase.ConditionalGetTableDescFromTxn(ctx, txn, r.execCfg.Codec, tbl)
  1096  			if err != nil {
  1097  				return errors.Wrap(err, "validating table descriptor has not changed")
  1098  			}
  1099  			b.CPut(
  1100  				sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, tableDesc.ID),
  1101  				sqlbase.WrapDescriptor(&tableDesc),
  1102  				existingDescVal,
  1103  			)
  1104  		}
  1105  
  1106  		if err := txn.Run(ctx, b); err != nil {
  1107  			return errors.Wrap(err, "publishing tables")
  1108  		}
  1109  
  1110  		// Update and persist the state of the job.
  1111  		details.TablesPublished = true
  1112  		if err := r.job.WithTxn(txn).SetDetails(ctx, details); err != nil {
  1113  			for _, newJob := range newSchemaChangeJobs {
  1114  				if cleanupErr := newJob.CleanupOnRollback(ctx); cleanupErr != nil {
  1115  					log.Warningf(ctx, "failed to clean up job %d: %v", newJob.ID(), cleanupErr)
  1116  				}
  1117  			}
  1118  			return errors.Wrap(err, "updating job details after publishing tables")
  1119  		}
  1120  
  1121  		return nil
  1122  	})
  1123  	if err != nil {
  1124  		return err
  1125  	}
  1126  
  1127  	// Start the schema change jobs we created.
  1128  	for _, newJob := range newSchemaChangeJobs {
  1129  		if _, err := newJob.Start(ctx); err != nil {
  1130  			return err
  1131  		}
  1132  	}
  1133  
  1134  	// Initiate a run of CREATE STATISTICS. We don't know the actual number of
  1135  	// rows affected per table, so we use a large number because we want to make
  1136  	// sure that stats always get created/refreshed here.
  1137  	for i := range r.tables {
  1138  		r.execCfg.StatsRefresher.NotifyMutation(r.tables[i].ID, math.MaxInt32 /* rowsAffected */)
  1139  	}
  1140  
  1141  	return nil
  1142  }
  1143  
  1144  // OnFailOrCancel is part of the jobs.Resumer interface. It removes KV data
  1145  // that has been committed from a restore that has failed or been canceled. It
  1146  // does this by writing the table descriptors in the DROP state, which causes
  1147  // the schema change GC machinery to delete the keys in the background.
  1148  func (r *restoreResumer) OnFailOrCancel(ctx context.Context, phs interface{}) error {
  1149  	telemetry.Count("restore.total.failed")
  1150  	telemetry.CountBucketed("restore.duration-sec.failed",
  1151  		int64(timeutil.Since(timeutil.FromUnixMicros(r.job.Payload().StartedMicros)).Seconds()))
  1152  
  1153  	execCfg := phs.(sql.PlanHookState).ExecCfg()
  1154  	return execCfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  1155  		return r.dropTables(ctx, execCfg.JobRegistry, txn)
  1156  	})
  1157  }
  1158  
  1159  // dropTables implements the OnFailOrCancel logic.
  1160  func (r *restoreResumer) dropTables(ctx context.Context, jr *jobs.Registry, txn *kv.Txn) error {
  1161  	details := r.job.Details().(jobspb.RestoreDetails)
  1162  
  1163  	// No need to mark the tables as dropped if they were not even created in the
  1164  	// first place.
  1165  	if !details.PrepareCompleted {
  1166  		return nil
  1167  	}
  1168  
  1169  	// Needed to trigger the schema change manager.
  1170  	if err := txn.SetSystemConfigTrigger(); err != nil {
  1171  		return err
  1172  	}
  1173  
  1174  	b := txn.NewBatch()
  1175  	// Drop the table descriptors that were created at the start of the restore.
  1176  	tablesToGC := make([]sqlbase.ID, 0, len(details.TableDescs))
  1177  	for _, tbl := range details.TableDescs {
  1178  		tablesToGC = append(tablesToGC, tbl.ID)
  1179  		tableDesc := *tbl
  1180  		tableDesc.Version++
  1181  		tableDesc.State = sqlbase.TableDescriptor_DROP
  1182  		err := sqlbase.RemovePublicTableNamespaceEntry(ctx, txn, keys.SystemSQLCodec, tbl.ParentID, tbl.Name)
  1183  		if err != nil {
  1184  			return errors.Wrap(err, "dropping tables caused by restore fail/cancel from public namespace")
  1185  		}
  1186  		existingDescVal, err := sqlbase.ConditionalGetTableDescFromTxn(ctx, txn, r.execCfg.Codec, tbl)
  1187  		if err != nil {
  1188  			return errors.Wrap(err, "dropping tables caused by restore fail/cancel")
  1189  		}
  1190  		b.CPut(
  1191  			sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, tableDesc.ID),
  1192  			sqlbase.WrapDescriptor(&tableDesc),
  1193  			existingDescVal,
  1194  		)
  1195  	}
  1196  
  1197  	// Queue a GC job.
  1198  	// Set the drop time as 1 (ns in Unix time), so that the table gets GC'd
  1199  	// immediately.
  1200  	dropTime := int64(1)
  1201  	gcDetails := jobspb.SchemaChangeGCDetails{}
  1202  	for _, tableID := range tablesToGC {
  1203  		gcDetails.Tables = append(gcDetails.Tables, jobspb.SchemaChangeGCDetails_DroppedID{
  1204  			ID:       tableID,
  1205  			DropTime: dropTime,
  1206  		})
  1207  	}
  1208  	gcJobRecord := jobs.Record{
  1209  		Description:   fmt.Sprintf("GC for %s", r.job.Payload().Description),
  1210  		Username:      r.job.Payload().Username,
  1211  		DescriptorIDs: tablesToGC,
  1212  		Details:       gcDetails,
  1213  		Progress:      jobspb.SchemaChangeGCProgress{},
  1214  		NonCancelable: true,
  1215  	}
  1216  	if _, err := jr.CreateJobWithTxn(ctx, gcJobRecord, txn); err != nil {
  1217  		return err
  1218  	}
  1219  
  1220  	// Drop the database descriptors that were created at the start of the
  1221  	// restore if they are now empty (i.e. no user created a table in this
  1222  	// database during the restore).
  1223  	var isDBEmpty bool
  1224  	var err error
  1225  	ignoredTables := make(map[sqlbase.ID]struct{})
  1226  	for _, table := range details.TableDescs {
  1227  		ignoredTables[table.ID] = struct{}{}
  1228  	}
  1229  	for _, dbDesc := range r.databases {
  1230  		// We need to ignore details.TableDescs since we haven't committed the txn that deletes these.
  1231  		isDBEmpty, err = isDatabaseEmpty(ctx, r.execCfg.DB, dbDesc, ignoredTables)
  1232  		if err != nil {
  1233  			return errors.Wrapf(err, "checking if database %s is empty during restore cleanup", dbDesc.Name)
  1234  		}
  1235  
  1236  		if isDBEmpty {
  1237  			descKey := sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, dbDesc.ID)
  1238  			b.Del(descKey)
  1239  			b.Del(sqlbase.NewDatabaseKey(dbDesc.Name).Key(keys.SystemSQLCodec))
  1240  		}
  1241  	}
  1242  	if err := txn.Run(ctx, b); err != nil {
  1243  		return errors.Wrap(err, "dropping tables created at the start of restore caused by fail/cancel")
  1244  	}
  1245  
  1246  	return nil
  1247  }
  1248  
  1249  // restoreSystemTables replaces the contents of each system table, one table
  1250  // per transaction, with the data from the restored copy of that table.
  1251  func (r *restoreResumer) restoreSystemTables(ctx context.Context) error {
  1252  	executor := r.execCfg.InternalExecutor
  1253  	var err error
  1254  	for _, systemTable := range fullClusterSystemTables {
  1255  		systemTxn := r.execCfg.DB.NewTxn(ctx, "system-restore-txn")
  1256  		txnDebugName := fmt.Sprintf("restore-system-systemTable-%s", systemTable)
  1257  		// Don't clear the jobs table, so as not to delete the jobs that are
  1258  		// performing the restore.
  1259  		if systemTable != sqlbase.JobsTable.Name {
  1260  			deleteQuery := fmt.Sprintf("DELETE FROM system.%s WHERE true;", systemTable)
  1261  			_, err = executor.Exec(ctx, txnDebugName+"-data-deletion", systemTxn, deleteQuery)
  1262  			if err != nil {
  1263  				return errors.Wrapf(err, "restoring system.%s", systemTable)
  1264  			}
  1265  		}
  1266  		restoreQuery := fmt.Sprintf("INSERT INTO system.%s (SELECT * FROM %s.%s);", systemTable, restoreTempSystemDB, systemTable)
  1267  		_, err = executor.Exec(ctx, txnDebugName+"-data-insert", systemTxn, restoreQuery)
  1268  		if err != nil {
  1269  			return errors.Wrap(err, "restoring system tables")
  1270  		}
  1271  		err = systemTxn.Commit(ctx)
  1272  		if err != nil {
  1273  			return errors.Wrap(err, "committing system systemTable restoration")
  1274  		}
  1275  	}
  1276  
  1277  	// After restoring the system tables, drop the temporary database holding the
  1278  	// system tables.
  1279  	dropTableQuery := fmt.Sprintf("DROP DATABASE %s CASCADE", restoreTempSystemDB)
  1280  	_, err = executor.Exec(ctx, "drop-temp-system-db" /* opName */, nil /* txn */, dropTableQuery)
  1281  	if err != nil {
  1282  		return errors.Wrap(err, "dropping temporary system db")
  1283  	}
  1284  
  1285  	return nil
  1286  }
  1287  
  1288  var _ jobs.Resumer = &restoreResumer{}
  1289  
  1290  func init() {
  1291  	jobs.RegisterConstructor(
  1292  		jobspb.TypeRestore,
  1293  		func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
  1294  			return &restoreResumer{
  1295  				job:      job,
  1296  				settings: settings,
  1297  			}
  1298  		},
  1299  	)
  1300  }