github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/backupccl/restore_job.go

// Copyright 2016 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
//     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package backupccl

import (
	"bytes"
	"context"
	"fmt"
	"math"
	"runtime"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/cockroach/pkg/ccl/storageccl"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/catalog/catalogkv"
	"github.com/cockroachdb/cockroach/pkg/sql/covering"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/stats"
	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/interval"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/tracing"
	"github.com/cockroachdb/errors"
	"github.com/opentracing/opentracing-go"
)

type intervalSpan roachpb.Span

var _ interval.Interface = intervalSpan{}

// ID is part of `interval.Interface` but unused in makeImportSpans.
func (ie intervalSpan) ID() uintptr { return 0 }

// Range is part of `interval.Interface`.
func (ie intervalSpan) Range() interval.Range {
	return interval.Range{Start: []byte(ie.Key), End: []byte(ie.EndKey)}
}

type importEntryType int

const (
	backupSpan importEntryType = iota
	backupFile
	tableSpan
	completedSpan
	request
)

type importEntry struct {
	roachpb.Span
	entryType importEntryType

	// Only set if entryType is backupSpan
	start, end hlc.Timestamp

	// Only set if entryType is backupFile
	dir  roachpb.ExternalStorage
	file BackupManifest_File

	// Only set if entryType is request
	files []roachpb.ImportRequest_File

	// For progress tracking we assign numbers to the spans, as they can be
	// executed out-of-order based on splitAndScatter's scheduling.
	progressIdx int
}

// makeImportSpans pivots the backups, which are grouped by time, into
// spans for import, which are grouped by keyrange.
//
// The core logic of this is in OverlapCoveringMerge, which accepts sets of
// non-overlapping key ranges (aka coverings) each with a payload, and returns
// them aligned with the payloads in the same order as in the input.
//
// Example (input):
// - [A, C) backup t0 to t1 -> /file1
// - [C, D) backup t0 to t1 -> /file2
// - [A, B) backup t1 to t2 -> /file3
// - [B, C) backup t1 to t2 -> /file4
// - [C, D) backup t1 to t2 -> /file5
// - [B, D) requested table data to be restored
//
// Example (output):
// - [A, B) -> /file1, /file3
// - [B, C) -> /file1, /file4, requested (note that file1 was split into two ranges)
// - [C, D) -> /file2, /file5, requested
//
// This would be turned into two Import spans, one restoring [B, C) out of
// /file1 and /file4, the other restoring [C, D) out of /file2 and /file5.
// Nothing is restored out of /file3 and only part of /file1 is used.
//
// NB: All grouping operates in the pre-rewrite keyspace, meaning the keyranges
// as they were backed up, not as they're being restored.
//
// If a span is not covered, the onMissing function is called with the span and
// time missing to determine what error, if any, should be returned.
func makeImportSpans(
	tableSpans []roachpb.Span,
	backups []BackupManifest,
	backupLocalityInfo []jobspb.RestoreDetails_BackupLocalityInfo,
	lowWaterMark roachpb.Key,
	onMissing func(span covering.Range, start, end hlc.Timestamp) error,
) ([]importEntry, hlc.Timestamp, error) {
	// Put the covering for the already-completed spans into the
	// OverlapCoveringMerge input first. Payloads are returned in the same order
	// that they appear in the input; putting the completedSpan first means we'll
	// see it first when iterating over the output of OverlapCoveringMerge and
	// avoid doing unnecessary work.
	completedCovering := covering.Covering{
		{
			Start:   []byte(keys.MinKey),
			End:     []byte(lowWaterMark),
			Payload: importEntry{entryType: completedSpan},
		},
	}

	// Put the merged table data covering into the OverlapCoveringMerge input
	// next.
	var tableSpanCovering covering.Covering
	for _, span := range tableSpans {
		tableSpanCovering = append(tableSpanCovering, covering.Range{
			Start: span.Key,
			End:   span.EndKey,
			Payload: importEntry{
				Span:      span,
				entryType: tableSpan,
			},
		})
	}

	backupCoverings := []covering.Covering{completedCovering, tableSpanCovering}

	// Iterate over backups creating three coverings for each: the spans
	// introduced by the backup, the spans that were backed up, and the files in
	// the backup. The files are a subset of the backed-up spans when some of
	// the keyranges didn't change since the previous backup. These are appended
	// in order (backup1 introduced, backup1 spans, backup1 files, backup2
	// introduced, backup2 spans, backup2 files, ...) so they retain that
	// ordering in the output of OverlapCoveringMerge.
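	//
	// As a concrete sketch (hypothetical backups b1 and b2), the slice handed
	// to OverlapCoveringMerge below ends up ordered as:
	//
	//	[completed, tableSpans,
	//	 b1 introduced spans, b1 spans, b1 files,
	//	 b2 introduced spans, b2 spans, b2 files]
	//
	// so a completedSpan payload, when present, is always seen first by the
	// rangeLoop below.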
	var maxEndTime hlc.Timestamp
	for i, b := range backups {
		if maxEndTime.Less(b.EndTime) {
			maxEndTime = b.EndTime
		}

		var backupNewSpanCovering covering.Covering
		for _, s := range b.IntroducedSpans {
			backupNewSpanCovering = append(backupNewSpanCovering, covering.Range{
				Start:   s.Key,
				End:     s.EndKey,
				Payload: importEntry{Span: s, entryType: backupSpan, start: hlc.Timestamp{}, end: b.StartTime},
			})
		}
		backupCoverings = append(backupCoverings, backupNewSpanCovering)

		var backupSpanCovering covering.Covering
		for _, s := range b.Spans {
			backupSpanCovering = append(backupSpanCovering, covering.Range{
				Start:   s.Key,
				End:     s.EndKey,
				Payload: importEntry{Span: s, entryType: backupSpan, start: b.StartTime, end: b.EndTime},
			})
		}
		backupCoverings = append(backupCoverings, backupSpanCovering)
		var backupFileCovering covering.Covering

		var storesByLocalityKV map[string]roachpb.ExternalStorage
		if backupLocalityInfo != nil && backupLocalityInfo[i].URIsByOriginalLocalityKV != nil {
			storesByLocalityKV = make(map[string]roachpb.ExternalStorage)
			for kv, uri := range backupLocalityInfo[i].URIsByOriginalLocalityKV {
				conf, err := cloud.ExternalStorageConfFromURI(uri)
				if err != nil {
					return nil, hlc.Timestamp{}, err
				}
				storesByLocalityKV[kv] = conf
			}
		}
		for _, f := range b.Files {
			dir := b.Dir
			if storesByLocalityKV != nil {
				if newDir, ok := storesByLocalityKV[f.LocalityKV]; ok {
					dir = newDir
				}
			}
			backupFileCovering = append(backupFileCovering, covering.Range{
				Start: f.Span.Key,
				End:   f.Span.EndKey,
				Payload: importEntry{
					Span:      f.Span,
					entryType: backupFile,
					dir:       dir,
					file:      f,
				},
			})
		}
		backupCoverings = append(backupCoverings, backupFileCovering)
	}

	// Group ranges covered by backups with ones needed to restore the selected
	// tables. Note that this breaks intervals up as necessary to align them.
	// See the function godoc for details.
	importRanges := covering.OverlapCoveringMerge(backupCoverings)

	// Translate the output of OverlapCoveringMerge into requests.
	var requestEntries []importEntry
rangeLoop:
	for _, importRange := range importRanges {
		needed := false
		var ts hlc.Timestamp
		var files []roachpb.ImportRequest_File
		payloads := importRange.Payload.([]interface{})
		for _, p := range payloads {
			ie := p.(importEntry)
			switch ie.entryType {
			case completedSpan:
				continue rangeLoop
			case tableSpan:
				needed = true
			case backupSpan:
				if ts != ie.start {
					return nil, hlc.Timestamp{}, errors.Errorf(
						"no backup covers time [%s,%s) for range [%s,%s) or backups listed out of order (mismatched start time)",
						ts, ie.start,
						roachpb.Key(importRange.Start), roachpb.Key(importRange.End))
				}
				ts = ie.end
			case backupFile:
				if len(ie.file.Path) > 0 {
					files = append(files, roachpb.ImportRequest_File{
						Dir:    ie.dir,
						Path:   ie.file.Path,
						Sha512: ie.file.Sha512,
					})
				}
			}
		}
		if needed {
			if ts != maxEndTime {
				if err := onMissing(importRange, ts, maxEndTime); err != nil {
					return nil, hlc.Timestamp{}, err
				}
			}
			// If needed is false, we have data backed up that is not necessary
			// for this restore. Skip it.
			requestEntries = append(requestEntries, importEntry{
				Span:      roachpb.Span{Key: importRange.Start, EndKey: importRange.End},
				entryType: request,
				files:     files,
			})
		}
	}
	return requestEntries, maxEndTime, nil
}

// splitAndScatter creates new ranges for importSpans and scatters replicas and
// leaseholders to be as evenly balanced as possible. It does this with some
// amount of parallelism but also staying as close to the order in importSpans
// as possible (the more out of order, the more work is done if a RESTORE job
// loses its lease and has to be restarted).
//
// At a high level, this is accomplished by splitting and scattering large
// "chunks" from the front of importSpans in one goroutine, each of which is
// in turn passed to one of many worker goroutines that split and scatter the
// individual entries.
//
// Entries are sent to readyForImportCh as they are scattered, so letting
// that channel send block can be used for backpressure on the splits and
// scatters.
//
// TODO(dan): This logic is largely tuned by running BenchmarkRestore2TB. See if
// there's some way to test it without running an O(hour) long benchmark.
func splitAndScatter(
	restoreCtx context.Context,
	settings *cluster.Settings,
	db *kv.DB,
	kr *storageccl.KeyRewriter,
	numClusterNodes int,
	importSpans []importEntry,
	readyForImportCh chan<- importEntry,
) error {
	var span opentracing.Span
	ctx, span := tracing.ChildSpan(restoreCtx, "presplit-scatter")
	defer tracing.FinishSpan(span)

	g := ctxgroup.WithContext(ctx)

	// TODO(dan): This is not super principled. I just wanted something that
	// wasn't a constant and grew slower than linear with the length of
	// importSpans. It seems to be working well for BenchmarkRestore2TB but is
	// worth revisiting.
	chunkSize := int(math.Sqrt(float64(len(importSpans))))
	importSpanChunks := make([][]importEntry, 0, len(importSpans)/chunkSize)
	for start := 0; start < len(importSpans); {
		importSpanChunk := importSpans[start:]
		end := start + chunkSize
		if end < len(importSpans) {
			importSpanChunk = importSpans[start:end]
		}
		importSpanChunks = append(importSpanChunks, importSpanChunk)
		start = end
	}

	importSpanChunksCh := make(chan []importEntry)
	expirationTime := db.Clock().Now().Add(time.Hour.Nanoseconds(), 0)
	g.GoCtx(func(ctx context.Context) error {
		defer close(importSpanChunksCh)
		for idx, importSpanChunk := range importSpanChunks {
			// TODO(dan): The structure between this and the below are very
			// similar. Dedup.
			chunkKey, err := rewriteBackupSpanKey(kr, importSpanChunk[0].Key)
			if err != nil {
				return err
			}

			// TODO(dan): Really, this should be splitting the Key of the first
			// entry in the _next_ chunk.
			log.VEventf(restoreCtx, 1, "presplitting chunk %d of %d", idx, len(importSpanChunks))
			if err := db.AdminSplit(ctx, chunkKey, chunkKey, expirationTime); err != nil {
				return err
			}

			log.VEventf(restoreCtx, 1, "scattering chunk %d of %d", idx, len(importSpanChunks))
			scatterReq := &roachpb.AdminScatterRequest{
				RequestHeader: roachpb.RequestHeaderFromSpan(roachpb.Span{
					Key:    chunkKey,
					EndKey: chunkKey.Next(),
				}),
				// TODO(dan): This is a bit of a hack, but it seems to be an effective
				// one (see the PR that added it for graphs).
				// As of the commit that added this, scatter is not very good at
				// actually balancing leases. This is likely for two reasons: 1) there's
				// almost certainly some regression in scatter's behavior, it used to
				// work much better and 2) scatter has to operate by balancing leases
				// for all ranges in a cluster, but in RESTORE, we really just want it
				// to be balancing the span being restored into.
				RandomizeLeases: true,
			}
			if _, pErr := kv.SendWrapped(ctx, db.NonTransactionalSender(), scatterReq); pErr != nil {
				// TODO(dan): Unfortunately, Scatter is still too unreliable to
				// fail the RESTORE when Scatter fails. I'm uncomfortable that
				// this could break entirely and not start failing the tests,
				// but on the bright side, it doesn't affect correctness, only
				// throughput.
				log.Errorf(ctx, "failed to scatter chunk %d: %s", idx, pErr.GoError())
			}

			select {
			case <-ctx.Done():
				return ctx.Err()
			case importSpanChunksCh <- importSpanChunk:
			}
		}
		return nil
	})

	// TODO(dan): This tries to cover for a bad scatter by having 2 * the number
	// of nodes in the cluster. Is it necessary?
	splitScatterWorkers := numClusterNodes * 2
	var splitScatterStarted uint64 // Only access using atomic.
	for worker := 0; worker < splitScatterWorkers; worker++ {
		g.GoCtx(func(ctx context.Context) error {
			for importSpanChunk := range importSpanChunksCh {
				for _, importSpan := range importSpanChunk {
					idx := atomic.AddUint64(&splitScatterStarted, 1)

					newSpanKey, err := rewriteBackupSpanKey(kr, importSpan.Span.Key)
					if err != nil {
						return err
					}

					// TODO(dan): Really, this should be splitting the Key of
					// the _next_ entry.
					log.VEventf(restoreCtx, 1, "presplitting %d of %d", idx, len(importSpans))
					if err := db.AdminSplit(ctx, newSpanKey, newSpanKey, expirationTime); err != nil {
						return err
					}

					log.VEventf(restoreCtx, 1, "scattering %d of %d", idx, len(importSpans))
					scatterReq := &roachpb.AdminScatterRequest{
						RequestHeader: roachpb.RequestHeaderFromSpan(roachpb.Span{Key: newSpanKey, EndKey: newSpanKey.Next()}),
					}
					if _, pErr := kv.SendWrapped(ctx, db.NonTransactionalSender(), scatterReq); pErr != nil {
						// TODO(dan): Unfortunately, Scatter is still too unreliable to
						// fail the RESTORE when Scatter fails. I'm uncomfortable that
						// this could break entirely and not start failing the tests,
						// but on the bright side, it doesn't affect correctness, only
						// throughput.
						log.Errorf(ctx, "failed to scatter %d: %s", idx, pErr.GoError())
					}

					select {
					case <-ctx.Done():
						return ctx.Err()
					case readyForImportCh <- importSpan:
					}
				}
			}
			return nil
		})
	}

	return g.Wait()
}

// WriteTableDescs writes all the new descriptors: first the ID ->
// TableDescriptor for the new table, then flips (or initializes) the name ->
// ID entry so any new queries will use the new one. The tables are assigned
// the permissions of their parent database and the user must have CREATE
// permission on that database at the time this function is called.
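//
// A minimal calling sketch (hypothetical variable names), mirroring how
// createImportingTables below invokes it while preparing a restore:
//
//	err := execCfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
//		return WriteTableDescs(ctx, txn, databases, tables, coverage, settings, nil /* extra */)
//	})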
func WriteTableDescs(
	ctx context.Context,
	txn *kv.Txn,
	databases []*sqlbase.DatabaseDescriptor,
	tables []*sqlbase.TableDescriptor,
	descCoverage tree.DescriptorCoverage,
	settings *cluster.Settings,
	extra []roachpb.KeyValue,
) error {
	ctx, span := tracing.ChildSpan(ctx, "WriteTableDescs")
	defer tracing.FinishSpan(span)
	err := func() error {
		b := txn.NewBatch()
		wroteDBs := make(map[sqlbase.ID]*sqlbase.DatabaseDescriptor)
		for _, desc := range databases {
			// If the restore is not a full cluster restore, we cannot know that
			// the users on the restoring cluster match the ones that were on the
			// cluster that was backed up. So we wipe the privileges on the
			// database.
			if descCoverage != tree.AllDescriptors {
				desc.Privileges = sqlbase.NewDefaultPrivilegeDescriptor()
			}
			wroteDBs[desc.ID] = desc
			if err := catalogkv.WriteNewDescToBatch(ctx, false /* kvTrace */, settings, b, keys.SystemSQLCodec, desc.ID, desc); err != nil {
				return err
			}
			// Depending on which cluster version we are restoring to, we decide
			// which namespace table to write the descriptor into. This may cause
			// wrong behavior if the cluster version is bumped DURING a restore.
			dKey := sqlbase.MakeDatabaseNameKey(ctx, settings, desc.Name)
			b.CPut(dKey.Key(keys.SystemSQLCodec), desc.ID, nil)
		}
		for i := range tables {
			// For full cluster restore, keep privileges as they were.
			if wrote, ok := wroteDBs[tables[i].ParentID]; ok {
				// Leave the privileges of the temp system tables as
				// the default.
				if descCoverage != tree.AllDescriptors || wrote.Name == restoreTempSystemDB {
					tables[i].Privileges = wrote.GetPrivileges()
				}
			} else {
				parentDB, err := sqlbase.GetDatabaseDescFromID(ctx, txn, keys.SystemSQLCodec, tables[i].ParentID)
				if err != nil {
					return errors.Wrapf(err,
						"failed to lookup parent DB %d", errors.Safe(tables[i].ParentID))
				}
				// We don't check privileges here since we checked them during
				// job planning.

				// On full cluster restore, keep the privileges as they are in
				// the backup.
				if descCoverage != tree.AllDescriptors {
					// Default is to copy privileges from the restoring parent
					// db, like CREATE TABLE.
					// TODO(dt): Make this more configurable.
					tables[i].Privileges = parentDB.GetPrivileges()
				}
			}
			if err := catalogkv.WriteNewDescToBatch(ctx, false /* kvTrace */, settings, b, keys.SystemSQLCodec, tables[i].ID, tables[i]); err != nil {
				return err
			}
			// Depending on which cluster version we are restoring to, we decide
			// which namespace table to write the descriptor into. This may cause
			// wrong behavior if the cluster version is bumped DURING a restore.
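			// For illustration (hypothetical values): a table named "bank"
			// restored under a parent database with ID 52 gets a namespace
			// entry keyed on (52, "bank") whose value is the table's new ID;
			// the CPut below with a nil expected value fails, rather than
			// overwrites, if such an entry already exists.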
			tkey := sqlbase.MakePublicTableNameKey(ctx, settings, tables[i].ParentID, tables[i].Name)
			b.CPut(tkey.Key(keys.SystemSQLCodec), tables[i].ID, nil)
		}
		for _, kv := range extra {
			b.InitPut(kv.Key, &kv.Value, false)
		}
		if err := txn.Run(ctx, b); err != nil {
			if errors.HasType(err, (*roachpb.ConditionFailedError)(nil)) {
				return pgerror.Newf(pgcode.DuplicateObject, "table already exists")
			}
			return err
		}

		for _, table := range tables {
			if err := table.Validate(ctx, txn, keys.SystemSQLCodec); err != nil {
				return errors.Wrapf(err,
					"validate table %d", errors.Safe(table.ID))
			}
		}
		return nil
	}()
	return errors.Wrapf(err, "restoring table desc and namespace entries")
}

// rewriteBackupSpanKey rewrites a backup span start key for the purposes of
// splitting up the target key-space to send out the actual work of restoring.
//
// Keys for the primary index of the top-level table are rewritten to just the
// overall start of the table. That is, /Table/51/1 becomes /Table/51.
//
// Any suffix of the key that is not rewritten by kr's configured rewrites is
// truncated. For instance, if a passed span has key /Table/51/1/77#/53/2/1 but
// kr is only configured with a rewrite for 51, it would return /Table/51/1/77.
// Such span boundaries are usually due to an interleaved table which has since
// been dropped -- any splits that happened to pick one of its rows live on,
// but include an ID of a table that no longer exists.
//
// Note that the actual restore process (i.e. inside ImportRequest) does not use
// these keys -- they are only used to split the key space and distribute those
// requests, thus truncation is fine. In the rare case where multiple backup
// spans are truncated to the same prefix (i.e. entire spans resided under the
// same interleave parent row) we'll generate some no-op splits and route the
// work to the same range, but the actual imported data is unaffected.
func rewriteBackupSpanKey(kr *storageccl.KeyRewriter, key roachpb.Key) (roachpb.Key, error) {
	newKey, rewritten, err := kr.RewriteKey(append([]byte(nil), key...), true /* isFromSpan */)
	if err != nil {
		return nil, errors.NewAssertionErrorWithWrappedErrf(err,
			"could not rewrite span start key: %s", key)
	}
	if !rewritten && bytes.Equal(newKey, key) {
		// If nothing was changed, we didn't match the top-level key at all.
		return nil, errors.AssertionFailedf(
			"no rewrite for span start key: %s", key)
	}
	// Modify all spans that begin at the primary index to instead begin at the
	// start of the table. That is, change a span start key from /Table/51/1 to
	// /Table/51. Otherwise a permanently empty span at /Table/51-/Table/51/1
	// will be created.
	if b, id, idx, err := keys.TODOSQLCodec.DecodeIndexPrefix(newKey); err != nil {
		return nil, errors.NewAssertionErrorWithWrappedErrf(err,
			"could not rewrite span start key: %s", key)
	} else if idx == 1 && len(b) == 0 {
		newKey = keys.TODOSQLCodec.TablePrefix(id)
	}
	return newKey, nil
}

// restore imports a SQL table (or tables) from sets of non-overlapping sstable
// files.
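//
// At a high level (see the goroutines started below), restore:
//
//	1. pivots the backup manifests into per-keyrange import spans,
//	2. splits and scatters those spans ahead of ingestion, and
//	3. sends one ImportRequest per span, advancing a high-water mark of
//	   contiguously completed spans that is persisted as job progress.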
func restore(
	restoreCtx context.Context,
	db *kv.DB,
	numClusterNodes int,
	settings *cluster.Settings,
	backupManifests []BackupManifest,
	backupLocalityInfo []jobspb.RestoreDetails_BackupLocalityInfo,
	endTime hlc.Timestamp,
	tables []*sqlbase.TableDescriptor,
	oldTableIDs []sqlbase.ID,
	spans []roachpb.Span,
	job *jobs.Job,
	encryption *roachpb.FileEncryptionOptions,
) (RowCount, error) {
	// A note about contexts and spans in this method: the top-level context
	// `restoreCtx` is used for orchestration logging. All operations that carry
	// out work get their individual contexts.

	mu := struct {
		syncutil.Mutex
		res               RowCount
		requestsCompleted []bool
		highWaterMark     int
	}{
		highWaterMark: -1,
	}

	// Get TableRekeys to use when importing raw data.
	var rekeys []roachpb.ImportRequest_TableRekey
	for i := range tables {
		tableToSerialize := tables[i]
		newDescBytes, err := protoutil.Marshal(sqlbase.WrapDescriptor(tableToSerialize))
		if err != nil {
			return mu.res, errors.NewAssertionErrorWithWrappedErrf(err,
				"marshaling descriptor")
		}
		rekeys = append(rekeys, roachpb.ImportRequest_TableRekey{
			OldID:   uint32(oldTableIDs[i]),
			NewDesc: newDescBytes,
		})
	}
	kr, err := storageccl.MakeKeyRewriterFromRekeys(rekeys)
	if err != nil {
		return mu.res, err
	}

	// Pivot the backups, which are grouped by time, into requests for import,
	// which are grouped by keyrange.
	highWaterMark := job.Progress().Details.(*jobspb.Progress_Restore).Restore.HighWater
	importSpans, _, err := makeImportSpans(spans, backupManifests, backupLocalityInfo, highWaterMark, errOnMissingRange)
	if err != nil {
		return mu.res, errors.Wrapf(err, "making import requests for %d backups", len(backupManifests))
	}

	for i := range importSpans {
		importSpans[i].progressIdx = i
	}
	mu.requestsCompleted = make([]bool, len(importSpans))

	progressLogger := jobs.NewChunkProgressLogger(job, len(importSpans), job.FractionCompleted(),
		func(progressedCtx context.Context, details jobspb.ProgressDetails) {
			switch d := details.(type) {
			case *jobspb.Progress_Restore:
				mu.Lock()
				if mu.highWaterMark >= 0 {
					d.Restore.HighWater = importSpans[mu.highWaterMark].Key
				}
				mu.Unlock()
			default:
				log.Errorf(progressedCtx, "job payload had unexpected type %T", d)
			}
		})

	pkIDs := make(map[uint64]struct{})
	for _, tbl := range tables {
		pkIDs[roachpb.BulkOpSummaryID(uint64(tbl.ID), uint64(tbl.PrimaryIndex.ID))] = struct{}{}
	}

	// We're already limiting these on the server-side, but sending all the
	// Import requests at once would fill up distsender/grpc/something and cause
	// all sorts of badness (node liveness timeouts leading to mass leaseholder
	// transfers, poor performance on SQL workloads, etc) as well as log spam
	// about slow distsender requests. Rate limit them here, too.
	//
	// Use the number of cpus across all nodes in the cluster as the number of
	// outstanding Import requests for the rate limiting. Note that this assumes
	// all nodes in the cluster have the same number of cpus, but it's okay if
	// that's wrong.
	//
	// TODO(dan): Make this limiting per node.
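	// For example (hypothetical numbers): with numClusterNodes = 3 and
	// runtime.NumCPU() = 16 on this node, at most 3 * 16 = 48 Import requests
	// are outstanding at any one time.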
	maxConcurrentImports := numClusterNodes * runtime.NumCPU()
	importsSem := make(chan struct{}, maxConcurrentImports)

	g := ctxgroup.WithContext(restoreCtx)

	// The Import (and resulting AddSSTable) requests made below run on
	// leaseholders, so presplit and scatter the ranges to balance the work
	// among many nodes.
	//
	// We're about to start off some goroutines that presplit & scatter each
	// import span. Once split and scattered, the span is submitted to
	// readyForImportCh to indicate it's ready for Import. Since import is so
	// much slower, we buffer the channel to keep the split/scatter work from
	// getting too far ahead. This both naturally rate limits the split/scatters
	// and bounds the number of empty ranges created if the RESTORE fails (or is
	// canceled).
	const presplitLeadLimit = 10
	readyForImportCh := make(chan importEntry, presplitLeadLimit)
	g.GoCtx(func(ctx context.Context) error {
		defer close(readyForImportCh)
		return splitAndScatter(ctx, settings, db, kr, numClusterNodes, importSpans, readyForImportCh)
	})

	requestFinishedCh := make(chan struct{}, len(importSpans)) // enough buffer to never block
	g.GoCtx(func(ctx context.Context) error {
		ctx, progressSpan := tracing.ChildSpan(ctx, "progress-log")
		defer tracing.FinishSpan(progressSpan)
		return progressLogger.Loop(ctx, requestFinishedCh)
	})
	g.GoCtx(func(ctx context.Context) error {
		log.Eventf(restoreCtx, "commencing import of data with concurrency %d", maxConcurrentImports)
		for readyForImportSpan := range readyForImportCh {
			newSpanKey, err := rewriteBackupSpanKey(kr, readyForImportSpan.Span.Key)
			if err != nil {
				return err
			}
			idx := readyForImportSpan.progressIdx

			importRequest := &roachpb.ImportRequest{
				// Import is a point request because we don't want DistSender to split
				// it. Assume (but don't require) the entire post-rewrite span is on the
				// same range.
				RequestHeader: roachpb.RequestHeader{Key: newSpanKey},
				DataSpan:      readyForImportSpan.Span,
				Files:         readyForImportSpan.files,
				EndTime:       endTime,
				Rekeys:        rekeys,
				Encryption:    encryption,
			}

			log.VEventf(restoreCtx, 1, "importing %d of %d", idx, len(importSpans))

			select {
			case importsSem <- struct{}{}:
			case <-ctx.Done():
				return ctx.Err()
			}

			g.GoCtx(func(ctx context.Context) error {
				ctx, importSpan := tracing.ChildSpan(ctx, "import")
				log.Event(ctx, "acquired semaphore")
				defer tracing.FinishSpan(importSpan)
				defer func() { <-importsSem }()

				importRes, pErr := kv.SendWrapped(ctx, db.NonTransactionalSender(), importRequest)
				if pErr != nil {
					return errors.Wrapf(pErr.GoError(), "importing span %v", importRequest.DataSpan)
				}

				mu.Lock()
				mu.res.add(countRows(importRes.(*roachpb.ImportResponse).Imported, pkIDs))

				// Assert that we're actually marking the correct span done. See #23977.
				if !importSpans[idx].Key.Equal(importRequest.DataSpan.Key) {
					mu.Unlock()
					return errors.Newf("request %d for span %v (to %v) does not match import span for same idx: %v",
						idx, importRequest.DataSpan, newSpanKey, importSpans[idx],
					)
				}
				mu.requestsCompleted[idx] = true
				for j := mu.highWaterMark + 1; j < len(mu.requestsCompleted) && mu.requestsCompleted[j]; j++ {
					mu.highWaterMark = j
				}
				mu.Unlock()

				requestFinishedCh <- struct{}{}
				return nil
			})
		}
		log.Event(restoreCtx, "wait for outstanding imports to finish")
		return nil
	})

	if err := g.Wait(); err != nil {
		// This leaves the data that did get imported in case the user wants to
		// retry.
		// TODO(dan): Build tooling to allow a user to restart a failed restore.
		return mu.res, errors.Wrapf(err, "importing %d ranges", len(importSpans))
	}

	return mu.res, nil
}

// loadBackupSQLDescs extracts the backup descriptors, the latest backup
// descriptor, and all the Descriptors for a backup to be restored. It upgrades
// the table descriptors to the new FK representation if necessary. FKs that
// can't be restored because the necessary tables are missing are omitted; if
// skip_missing_foreign_keys was not set, we should have aborted the RESTORE
// and returned an error prior to this.
func loadBackupSQLDescs(
	ctx context.Context,
	p sql.PlanHookState,
	details jobspb.RestoreDetails,
	encryption *roachpb.FileEncryptionOptions,
) ([]BackupManifest, BackupManifest, []sqlbase.Descriptor, error) {
	backupManifests, err := loadBackupManifests(ctx, details.URIs, p.ExecCfg().DistSQLSrv.ExternalStorageFromURI, encryption)
	if err != nil {
		return nil, BackupManifest{}, nil, err
	}

	// Upgrade the table descriptors to use the new FK representation.
	// TODO(lucy, jordan): This should become unnecessary in 20.1 when we stop
	// writing old-style descs in RestoreDetails (unless a job persists across
	// an upgrade?).
	if err := maybeUpgradeTableDescsInBackupManifests(ctx, backupManifests, p.ExecCfg().Codec, true /* skipFKsWithNoMatchingTable */); err != nil {
		return nil, BackupManifest{}, nil, err
	}

	allDescs, latestBackupManifest := loadSQLDescsFromBackupsAtTime(backupManifests, details.EndTime)

	var sqlDescs []sqlbase.Descriptor
	for _, desc := range allDescs {
		if _, ok := details.TableRewrites[desc.GetID()]; ok {
			sqlDescs = append(sqlDescs, desc)
		}
	}
	return backupManifests, latestBackupManifest, sqlDescs, nil
}

type restoreResumer struct {
	job                *jobs.Job
	settings           *cluster.Settings
	databases          []*sqlbase.DatabaseDescriptor
	tables             []*sqlbase.TableDescriptor
	descriptorCoverage tree.DescriptorCoverage
	latestStats        []*stats.TableStatisticProto
	execCfg            *sql.ExecutorConfig
}

// remapRelevantStatistics changes the table ID references in the stats
// from those they had in the backed-up database to what they should be
// in the restored database.
// It also keeps only the statistics that belong to one of the tables being
// restored: if tableRewrites can rewrite the table ID, then that table is
// being restored.
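//
// For example (hypothetical IDs): a statistic recorded for table ID 52 in the
// backup is emitted with TableID 78 if tableRewrites maps 52 -> 78, and is
// dropped entirely if 52 has no entry in tableRewrites.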
func remapRelevantStatistics(
	backup BackupManifest, tableRewrites TableRewriteMap,
) []*stats.TableStatisticProto {
	relevantTableStatistics := make([]*stats.TableStatisticProto, 0, len(backup.Statistics))

	for i := range backup.Statistics {
		stat := backup.Statistics[i]
		tableRewrite, ok := tableRewrites[stat.TableID]
		if !ok {
			// Table re-write not present, so statistic should not be imported.
			continue
		}
		stat.TableID = tableRewrite.TableID
		relevantTableStatistics = append(relevantTableStatistics, stat)
	}

	return relevantTableStatistics
}

// isDatabaseEmpty checks whether any tables exist in the given database.
// It pretends that the `ignoredTables` do not exist for the purposes of
// checking if a database is empty.
//
// It is used to construct a transaction which deletes a set of tables as well
// as some empty databases. However, we want to check that the databases are
// empty _after_ the transaction would have completed, so we want to ignore
// the tables that we're deleting in the same transaction. It is done this way
// to avoid having 2 transactions reading and writing the same keys one right
// after the other.
func isDatabaseEmpty(
	ctx context.Context,
	db *kv.DB,
	dbDesc *sql.DatabaseDescriptor,
	ignoredTables map[sqlbase.ID]struct{},
) (bool, error) {
	var allDescs []sqlbase.Descriptor
	if err := db.Txn(
		ctx,
		func(ctx context.Context, txn *kv.Txn) error {
			var err error
			allDescs, err = allSQLDescriptors(ctx, txn)
			return err
		}); err != nil {
		return false, err
	}

	for _, desc := range allDescs {
		if t := desc.Table(hlc.Timestamp{}); t != nil {
			if _, ok := ignoredTables[t.GetID()]; ok {
				continue
			}
			if t.GetParentID() == dbDesc.ID {
				return false, nil
			}
		}
	}
	return true, nil
}

// createImportingTables creates the tables that we will restore into. It also
// fetches the information from the old tables that we need for the restore.
func createImportingTables(
	ctx context.Context, p sql.PlanHookState, sqlDescs []sqlbase.Descriptor, r *restoreResumer,
) (
	[]*sqlbase.DatabaseDescriptor,
	[]*sqlbase.TableDescriptor,
	[]sqlbase.ID,
	[]roachpb.Span,
	error,
) {
	details := r.job.Details().(jobspb.RestoreDetails)

	var databases []*sqlbase.DatabaseDescriptor
	var tables []*sqlbase.TableDescriptor
	var oldTableIDs []sqlbase.ID
	for _, desc := range sqlDescs {
		if tableDesc := desc.Table(hlc.Timestamp{}); tableDesc != nil {
			tables = append(tables, tableDesc)
			oldTableIDs = append(oldTableIDs, tableDesc.ID)
		}
		if dbDesc := desc.GetDatabase(); dbDesc != nil {
			if rewrite, ok := details.TableRewrites[dbDesc.ID]; ok {
				dbDesc.ID = rewrite.TableID
				databases = append(databases, dbDesc)
			}
		}
	}
	tempSystemDBID := keys.MinNonPredefinedUserDescID
	for id := range details.TableRewrites {
		if int(id) > tempSystemDBID {
			tempSystemDBID = int(id)
		}
	}
	if details.DescriptorCoverage == tree.AllDescriptors {
		databases = append(databases, &sqlbase.DatabaseDescriptor{
			ID:         sqlbase.ID(tempSystemDBID),
			Name:       restoreTempSystemDB,
			Privileges: sqlbase.NewDefaultPrivilegeDescriptor(),
		})
	}

	// We get the spans of the restoring tables _as they appear in the backup_,
	// that is, in the 'old' keyspace, before we reassign the table IDs.
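	// For example (hypothetical IDs): a table backed up as ID 52 contributes
	// spans under /Table/52 here, even though RewriteTableDescs below may give
	// it a new ID such as 78; the data is only rekeyed into the new keyspace
	// later, by the ImportRequests issued in restore.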
	spans := spansForAllTableIndexes(p.ExecCfg().Codec, tables, nil)

	log.Eventf(ctx, "starting restore for %d tables", len(tables))

	// Assign new IDs and privileges to the tables, and update all references to
	// use the new IDs.
	if err := RewriteTableDescs(tables, details.TableRewrites, details.OverrideDB); err != nil {
		return nil, nil, nil, nil, err
	}

	for _, desc := range tables {
		desc.Version++
		desc.State = sqlbase.TableDescriptor_OFFLINE
		desc.OfflineReason = "restoring"
	}

	if !details.PrepareCompleted {
		err := p.ExecCfg().DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
			// Write the new TableDescriptors which are set in the OFFLINE state.
			if err := WriteTableDescs(ctx, txn, databases, tables, details.DescriptorCoverage, r.settings, nil /* extra */); err != nil {
				return errors.Wrapf(err, "restoring %d TableDescriptors from %d databases", len(r.tables), len(databases))
			}

			details.PrepareCompleted = true
			details.TableDescs = tables

			// Update the job once all descs have been prepared for ingestion.
			err := r.job.WithTxn(txn).SetDetails(ctx, details)

			return err
		})
		if err != nil {
			return nil, nil, nil, nil, err
		}
	}

	return databases, tables, oldTableIDs, spans, nil
}

// Resume is part of the jobs.Resumer interface.
func (r *restoreResumer) Resume(
	ctx context.Context, phs interface{}, resultsCh chan<- tree.Datums,
) error {
	details := r.job.Details().(jobspb.RestoreDetails)
	p := phs.(sql.PlanHookState)

	backupManifests, latestBackupManifest, sqlDescs, err := loadBackupSQLDescs(
		ctx, p, details, details.Encryption,
	)
	if err != nil {
		return err
	}

	databases, tables, oldTableIDs, spans, err := createImportingTables(ctx, p, sqlDescs, r)
	if err != nil {
		return err
	}
	r.tables = tables
	r.descriptorCoverage = details.DescriptorCoverage
	r.databases = databases
	r.execCfg = p.ExecCfg()
	r.latestStats = remapRelevantStatistics(latestBackupManifest, details.TableRewrites)

	if len(r.tables) == 0 {
		// We have no tables to restore (we are restoring an empty DB).
		// Since we have already created any new databases that we needed,
		// we can return without importing any data.
		log.Warning(ctx, "no tables to restore")
		return nil
	}

	numClusterNodes, err := clusterNodeCount(p.ExecCfg().Gossip)
	if err != nil {
		return err
	}

	res, err := restore(
		ctx,
		p.ExecCfg().DB,
		numClusterNodes,
		p.ExecCfg().Settings,
		backupManifests,
		details.BackupLocalityInfo,
		details.EndTime,
		tables,
		oldTableIDs,
		spans,
		r.job,
		details.Encryption,
	)
	if err != nil {
		return err
	}

	if err := r.insertStats(ctx); err != nil {
		return errors.Wrap(err, "inserting table statistics")
	}

	if err := r.publishTables(ctx); err != nil {
		return err
	}

	if r.descriptorCoverage == tree.AllDescriptors {
		if err := r.restoreSystemTables(ctx); err != nil {
			return err
		}
	}

	resultsCh <- tree.Datums{
		tree.NewDInt(tree.DInt(*r.job.ID())),
		tree.NewDString(string(jobs.StatusSucceeded)),
		tree.NewDFloat(tree.DFloat(1.0)),
		tree.NewDInt(tree.DInt(res.Rows)),
		tree.NewDInt(tree.DInt(res.IndexEntries)),
		tree.NewDInt(tree.DInt(res.DataSize)),
	}

	// Collect telemetry.
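	// As a worked example (hypothetical figures): a restore of 100 GiB that
	// took 1000 seconds records sizeMb = 102400, sec = 1000, and
	// mbps = 102400 / 1000 = 102; the per-node buckets divide that by
	// numClusterNodes.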
	{
		telemetry.Count("restore.total.succeeded")
		const mb = 1 << 20
		sizeMb := res.DataSize / mb
		sec := int64(timeutil.Since(timeutil.FromUnixMicros(r.job.Payload().StartedMicros)).Seconds())
		var mbps int64
		if sec > 0 {
			mbps = sizeMb / sec
		}
		telemetry.CountBucketed("restore.duration-sec.succeeded", sec)
		telemetry.CountBucketed("restore.size-mb.full", sizeMb)
		telemetry.CountBucketed("restore.speed-mbps.total", mbps)
		telemetry.CountBucketed("restore.speed-mbps.per-node", mbps/int64(numClusterNodes))
		// Tiny restores may skew throughput numbers due to overhead.
		if sizeMb > 10 {
			telemetry.CountBucketed("restore.speed-mbps.over10mb", mbps)
			telemetry.CountBucketed("restore.speed-mbps.over10mb.per-node", mbps/int64(numClusterNodes))
		}
	}
	return nil
}

// insertStats re-inserts the table statistics stored in the backup manifest.
func (r *restoreResumer) insertStats(ctx context.Context) error {
	details := r.job.Details().(jobspb.RestoreDetails)
	if details.StatsInserted {
		return nil
	}

	err := r.execCfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		if err := stats.InsertNewStats(ctx, r.execCfg.InternalExecutor, txn, r.latestStats); err != nil {
			return errors.Wrapf(err, "inserting stats from backup")
		}
		details.StatsInserted = true
		if err := r.job.WithTxn(txn).SetDetails(ctx, details); err != nil {
			return errors.Wrapf(err, "updating job marking stats insertion complete")
		}
		return nil
	})
	if err != nil {
		return err
	}
	return nil
}

// publishTables updates the restored tables' status from OFFLINE to PUBLIC.
func (r *restoreResumer) publishTables(ctx context.Context) error {
	details := r.job.Details().(jobspb.RestoreDetails)
	if details.TablesPublished {
		return nil
	}
	log.Event(ctx, "making tables live")

	newSchemaChangeJobs := make([]*jobs.StartableJob, 0)
	err := r.execCfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		// Write the new TableDescriptors and flip state over to public so they can be
		// accessed.
		b := txn.NewBatch()
		for _, tbl := range r.tables {
			tableDesc := *tbl
			tableDesc.Version++
			tableDesc.State = sqlbase.TableDescriptor_PUBLIC
			// Convert any mutations that were in progress on the table descriptor
			// when the backup was taken into schema change jobs.
			newJobs, err := createSchemaChangeJobsFromMutations(ctx, r.execCfg.JobRegistry, r.execCfg.Codec, txn, r.job.Payload().Username, &tableDesc)
			if err != nil {
				return err
			}
			newSchemaChangeJobs = append(newSchemaChangeJobs, newJobs...)
			existingDescVal, err := sqlbase.ConditionalGetTableDescFromTxn(ctx, txn, r.execCfg.Codec, tbl)
			if err != nil {
				return errors.Wrap(err, "validating table descriptor has not changed")
			}
			b.CPut(
				sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, tableDesc.ID),
				sqlbase.WrapDescriptor(&tableDesc),
				existingDescVal,
			)
		}

		if err := txn.Run(ctx, b); err != nil {
			return errors.Wrap(err, "publishing tables")
		}

		// Update and persist the state of the job.
		details.TablesPublished = true
		if err := r.job.WithTxn(txn).SetDetails(ctx, details); err != nil {
			for _, newJob := range newSchemaChangeJobs {
				if cleanupErr := newJob.CleanupOnRollback(ctx); cleanupErr != nil {
					log.Warningf(ctx, "failed to clean up job %d: %v", newJob.ID(), cleanupErr)
				}
			}
			return errors.Wrap(err, "updating job details after publishing tables")
		}

		return nil
	})
	if err != nil {
		return err
	}

	// Start the schema change jobs we created.
	for _, newJob := range newSchemaChangeJobs {
		if _, err := newJob.Start(ctx); err != nil {
			return err
		}
	}

	// Initiate a run of CREATE STATISTICS. We don't know the actual number of
	// rows affected per table, so we use a large number because we want to make
	// sure that stats always get created/refreshed here.
	for i := range r.tables {
		r.execCfg.StatsRefresher.NotifyMutation(r.tables[i].ID, math.MaxInt32 /* rowsAffected */)
	}

	return nil
}

// OnFailOrCancel is part of the jobs.Resumer interface. It removes KV data
// that has been committed from a restore that has failed or been canceled. It
// does this by writing the table descriptors in the DROP state and queueing a
// GC job, which deletes the keys in the background.
func (r *restoreResumer) OnFailOrCancel(ctx context.Context, phs interface{}) error {
	telemetry.Count("restore.total.failed")
	telemetry.CountBucketed("restore.duration-sec.failed",
		int64(timeutil.Since(timeutil.FromUnixMicros(r.job.Payload().StartedMicros)).Seconds()))

	execCfg := phs.(sql.PlanHookState).ExecCfg()
	return execCfg.DB.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		return r.dropTables(ctx, execCfg.JobRegistry, txn)
	})
}

// dropTables implements the OnFailOrCancel logic.
func (r *restoreResumer) dropTables(ctx context.Context, jr *jobs.Registry, txn *kv.Txn) error {
	details := r.job.Details().(jobspb.RestoreDetails)

	// No need to mark the tables as dropped if they were not even created in the
	// first place.
	if !details.PrepareCompleted {
		return nil
	}

	// Needed to trigger the schema change manager.
	if err := txn.SetSystemConfigTrigger(); err != nil {
		return err
	}

	b := txn.NewBatch()
	// Drop the table descriptors that were created at the start of the restore.
	tablesToGC := make([]sqlbase.ID, 0, len(details.TableDescs))
	for _, tbl := range details.TableDescs {
		tablesToGC = append(tablesToGC, tbl.ID)
		tableDesc := *tbl
		tableDesc.Version++
		tableDesc.State = sqlbase.TableDescriptor_DROP
		err := sqlbase.RemovePublicTableNamespaceEntry(ctx, txn, keys.SystemSQLCodec, tbl.ParentID, tbl.Name)
		if err != nil {
			return errors.Wrap(err, "dropping tables caused by restore fail/cancel from public namespace")
		}
		existingDescVal, err := sqlbase.ConditionalGetTableDescFromTxn(ctx, txn, r.execCfg.Codec, tbl)
		if err != nil {
			return errors.Wrap(err, "dropping tables caused by restore fail/cancel")
		}
		b.CPut(
			sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, tableDesc.ID),
			sqlbase.WrapDescriptor(&tableDesc),
			existingDescVal,
		)
	}

	// Queue a GC job.
	// Set the drop time as 1 (ns in Unix time), so that the table gets GC'd
	// immediately.
	dropTime := int64(1)
	gcDetails := jobspb.SchemaChangeGCDetails{}
	for _, tableID := range tablesToGC {
		gcDetails.Tables = append(gcDetails.Tables, jobspb.SchemaChangeGCDetails_DroppedID{
			ID:       tableID,
			DropTime: dropTime,
		})
	}
	gcJobRecord := jobs.Record{
		Description:   fmt.Sprintf("GC for %s", r.job.Payload().Description),
		Username:      r.job.Payload().Username,
		DescriptorIDs: tablesToGC,
		Details:       gcDetails,
		Progress:      jobspb.SchemaChangeGCProgress{},
		NonCancelable: true,
	}
	if _, err := jr.CreateJobWithTxn(ctx, gcJobRecord, txn); err != nil {
		return err
	}

	// Drop the database descriptors that were created at the start of the
	// restore if they are now empty (i.e. no user created a table in this
	// database during the restore).
	var isDBEmpty bool
	var err error
	ignoredTables := make(map[sqlbase.ID]struct{})
	for _, table := range details.TableDescs {
		ignoredTables[table.ID] = struct{}{}
	}
	for _, dbDesc := range r.databases {
		// We need to ignore details.TableDescs since we haven't committed the
		// txn that deletes these.
		isDBEmpty, err = isDatabaseEmpty(ctx, r.execCfg.DB, dbDesc, ignoredTables)
		if err != nil {
			return errors.Wrapf(err, "checking if database %s is empty during restore cleanup", dbDesc.Name)
		}

		if isDBEmpty {
			descKey := sqlbase.MakeDescMetadataKey(keys.SystemSQLCodec, dbDesc.ID)
			b.Del(descKey)
			b.Del(sqlbase.NewDatabaseKey(dbDesc.Name).Key(keys.SystemSQLCodec))
		}
	}
	if err := txn.Run(ctx, b); err != nil {
		return errors.Wrap(err, "dropping tables created at the start of restore caused by fail/cancel")
	}

	return nil
}

// restoreSystemTables replaces the contents of the system tables with the data
// from the restored system tables, one system table per transaction.
func (r *restoreResumer) restoreSystemTables(ctx context.Context) error {
	executor := r.execCfg.InternalExecutor
	var err error
	for _, systemTable := range fullClusterSystemTables {
		systemTxn := r.execCfg.DB.NewTxn(ctx, "system-restore-txn")
		txnDebugName := fmt.Sprintf("restore-system-systemTable-%s", systemTable)
		// Don't clear the jobs table, so as not to delete the jobs that are
		// performing the restore.
		if systemTable != sqlbase.JobsTable.Name {
			deleteQuery := fmt.Sprintf("DELETE FROM system.%s WHERE true;", systemTable)
			_, err = executor.Exec(ctx, txnDebugName+"-data-deletion", systemTxn, deleteQuery)
			if err != nil {
				return errors.Wrapf(err, "restoring system.%s", systemTable)
			}
		}
		restoreQuery := fmt.Sprintf("INSERT INTO system.%s (SELECT * FROM %s.%s);", systemTable, restoreTempSystemDB, systemTable)
		_, err = executor.Exec(ctx, txnDebugName+"-data-insert", systemTxn, restoreQuery)
		if err != nil {
			return errors.Wrap(err, "restoring system tables")
		}
		err = systemTxn.Commit(ctx)
		if err != nil {
			return errors.Wrap(err, "committing system systemTable restoration")
		}
	}

	// After restoring the system tables, drop the temporary database holding
	// the system tables.
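	// For each system table, e.g. "users" (illustrative), the loop above runs
	// roughly:
	//
	//	DELETE FROM system.users WHERE true;
	//	INSERT INTO system.users (SELECT * FROM <restoreTempSystemDB>.users);
	//
	// in a single transaction, with the DELETE skipped for the jobs table.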
	dropTableQuery := fmt.Sprintf("DROP DATABASE %s CASCADE", restoreTempSystemDB)
	_, err = executor.Exec(ctx, "drop-temp-system-db" /* opName */, nil /* txn */, dropTableQuery)
	if err != nil {
		return errors.Wrap(err, "dropping temporary system db")
	}

	return nil
}

var _ jobs.Resumer = &restoreResumer{}

func init() {
	jobs.RegisterConstructor(
		jobspb.TypeRestore,
		func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
			return &restoreResumer{
				job:      job,
				settings: settings,
			}
		},
	)
}