github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/importccl/import_processor_test.go

// Copyright 2019 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package importccl

import (
	"context"
	"fmt"
	"io/ioutil"
	"math"
	"net/url"
	"os"
	"sort"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/blobs"
	"github.com/cockroachdb/cockroach/pkg/ccl/backupccl"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/sql/distsql"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/row"
	"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
	"github.com/cockroachdb/errors"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

type testSpec struct {
	format roachpb.IOFileFormat
	inputs map[int32]string
	tables map[string]*execinfrapb.ReadImportDataSpec_ImportTable
}

// getConverterSpec returns a ReadImportDataSpec suitable for creating an
// input converter from the given test spec.
func (spec *testSpec) getConverterSpec() *execinfrapb.ReadImportDataSpec {
	return &execinfrapb.ReadImportDataSpec{
		Format:            spec.format,
		Tables:            spec.tables,
		Uri:               spec.inputs,
		ReaderParallelism: 1, // Make tests deterministic.
	}
}

func TestConverterFlushesBatches(t *testing.T) {
	defer leaktest.AfterTest(t)()
	// Reset the batch size setting upon test completion.
	defer row.TestingSetDatumRowConverterBatchSize(0)()

	// Helper to generate a test name.
	testName := func(format roachpb.IOFileFormat, batchSize int) string {
		switch batchSize {
		case 0:
			return fmt.Sprintf("%s-default-batch-size", format.Format)
		case 1:
			return fmt.Sprintf("%s-always-flush", format.Format)
		default:
			return fmt.Sprintf("%s-flush-%d-records", format.Format, batchSize)
		}
	}

	ctx := context.Background()
	evalCtx := tree.MakeTestingEvalContext(nil)

	tests := []testSpec{
		newTestSpec(t, csvFormat(), "testdata/csv/data-0"),
		newTestSpec(t, mysqlDumpFormat(), "testdata/mysqldump/simple.sql"),
		newTestSpec(t, pgDumpFormat(), "testdata/pgdump/simple.sql"),
		newTestSpec(t, avroFormat(t, roachpb.AvroOptions_OCF), "testdata/avro/simple.ocf"),
	}

	const endBatchSize = -1

	for _, testCase := range tests {
		expectedNumRecords := 0
		expectedNumBatches := 0
		converterSpec := testCase.getConverterSpec()

		// Run multiple tests, increasing the batch size until it exceeds the
		// total number of records. When the batch size is 0, we run converters
		// with the default batch size, and use that run to figure out the
		// expected number of records and batches for the subsequent runs.
		for batchSize := 0; batchSize != endBatchSize; {
			t.Run(testName(testCase.format, batchSize), func(t *testing.T) {
				if batchSize > 0 {
					row.TestingSetDatumRowConverterBatchSize(batchSize)
				}

				kvCh := make(chan row.KVBatch, batchSize)
				conv, err := makeInputConverter(ctx, converterSpec, &evalCtx, kvCh)
				if err != nil {
					t.Fatalf("makeInputConverter() error = %v", err)
				}

				group := ctxgroup.WithContext(ctx)
				group.Go(func() error {
					defer close(kvCh)
					return conv.readFiles(ctx, testCase.inputs, nil, converterSpec.Format, externalStorageFactory)
				})

				lastBatch := 0
				testNumRecords := 0
				testNumBatches := 0

				// Read from the channel; we expect batches of size batchSize,
				// with the exception of the last batch.
				for batch := range kvCh {
					if batchSize > 0 {
						assert.True(t, lastBatch == 0 || lastBatch == batchSize)
					}
					lastBatch = len(batch.KVs)
					testNumRecords += lastBatch
					testNumBatches++
				}
				if err := group.Wait(); err != nil {
					t.Fatalf("Conversion failed: %v", err)
				}

				if batchSize == 0 {
					expectedNumRecords = testNumRecords
					// Next run: flush every record.
					batchSize = 1
					expectedNumBatches = expectedNumRecords
				} else if batchSize > expectedNumRecords {
					// Done with this test case.
					batchSize = endBatchSize
					return
				} else {
					// The number of records and batches ought to be correct.
					assert.Equal(t, expectedNumRecords, testNumRecords)
					assert.Equal(t, expectedNumBatches, testNumBatches)

					// Progressively increase the batch size.
					batchSize += (batchSize << 2)
					expectedNumBatches = int(math.Ceil(float64(expectedNumRecords) / float64(batchSize)))
				}
			})
		}
	}
}
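
// To make the batch-size progression above concrete (an illustrative, made-up
// example, not tied to the actual test data): if the default-size run observes
// 100 records, the subsequent runs use batch sizes 1, 5, 25, and 125
// (batchSize += batchSize << 2 multiplies the size by five each time), and
// expect ceil(100/1)=100, ceil(100/5)=20, and ceil(100/25)=4 batches; the loop
// stops once the batch size (125) exceeds the record count.
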
// errorReportingRowReceiver is a RowReceiver implementation which fails the
// test if it receives an error.
type errorReportingRowReceiver struct {
	t *testing.T
}

var _ execinfra.RowReceiver = &errorReportingRowReceiver{}

func (r *errorReportingRowReceiver) Push(
	row sqlbase.EncDatumRow, meta *execinfrapb.ProducerMetadata,
) execinfra.ConsumerStatus {
	if r.t.Failed() || (meta != nil && meta.Err != nil) {
		if !r.t.Failed() {
			r.t.Fail()
		}
		r.t.Logf("receiver got an error: %v", meta.Err)
		return execinfra.ConsumerClosed
	}
	return execinfra.NeedMoreRows
}

func (r *errorReportingRowReceiver) ProducerDone() {}
func (r *errorReportingRowReceiver) Types() []*types.T {
	return nil
}

// doNothingKeyAdder is a no-op bulk adder implementation.
type doNothingKeyAdder struct {
	onKeyAdd func(key roachpb.Key)
	onFlush  func()
}

var _ kvserverbase.BulkAdder = &doNothingKeyAdder{}

func (a *doNothingKeyAdder) Add(_ context.Context, k roachpb.Key, _ []byte) error {
	if a.onKeyAdd != nil {
		a.onKeyAdd(k)
	}
	return nil
}
func (a *doNothingKeyAdder) Flush(_ context.Context) error {
	if a.onFlush != nil {
		a.onFlush()
	}
	return nil
}

func (*doNothingKeyAdder) IsEmpty() bool                     { return true }
func (*doNothingKeyAdder) CurrentBufferFill() float32        { return 0 }
func (*doNothingKeyAdder) GetSummary() roachpb.BulkOpSummary { return roachpb.BulkOpSummary{} }
func (*doNothingKeyAdder) Close(_ context.Context)           {}
func (a *doNothingKeyAdder) SetOnFlush(f func())             { a.onFlush = f }

var eofOffset int64 = math.MaxInt64

func TestImportIgnoresProcessedFiles(t *testing.T) {
	defer leaktest.AfterTest(t)()

	evalCtx := tree.MakeTestingEvalContext(nil)
	flowCtx := &execinfra.FlowCtx{
		EvalCtx: &evalCtx,
		Cfg: &execinfra.ServerConfig{
			Settings:        &cluster.Settings{},
			ExternalStorage: externalStorageFactory,
			BulkAdder: func(
				_ context.Context, _ *kv.DB, _ hlc.Timestamp,
				_ kvserverbase.BulkAdderOptions) (kvserverbase.BulkAdder, error) {
				return &doNothingKeyAdder{}, nil
			},
		},
	}

	// In this test, we'll specify import files that do not exist, but mark
	// those files as fully processed. The converters should not even attempt
	// to open these files (and if they do, we should report a test failure).
	tests := []struct {
		name         string
		spec         testSpec
		inputOffsets []int64 // Per-file resume offsets; eofOffset marks a file as fully processed.
	}{
		{
			"csv-two-invalid",
			newTestSpec(t, csvFormat(), "__invalid__", "testdata/csv/data-0", "/_/missing/_"),
			[]int64{eofOffset, 0, eofOffset},
		},
		{
			"csv-all-invalid",
			newTestSpec(t, csvFormat(), "__invalid__", "../../&"),
			[]int64{eofOffset, eofOffset},
		},
		{
			"csv-all-valid",
			newTestSpec(t, csvFormat(), "testdata/csv/data-0"),
			[]int64{0},
		},
		{
			"mysql-one-invalid",
			newTestSpec(t, mysqlDumpFormat(), "testdata/mysqldump/simple.sql", "/_/missing/_"),
			[]int64{0, eofOffset},
		},
		{
			"pgdump-one-input",
			newTestSpec(t, pgDumpFormat(), "testdata/pgdump/simple.sql"),
			[]int64{0},
		},
		{
			"avro-one-invalid",
			newTestSpec(t, avroFormat(t, roachpb.AvroOptions_OCF), "__invalid__", "testdata/avro/simple.ocf"),
			[]int64{eofOffset, 0},
		},
	}
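
	// As an illustration of the convention above (hypothetical values, not an
	// additional test case): inputOffsets of {eofOffset, 0, eofOffset} become
	// ResumePos = {0: math.MaxInt64, 2: math.MaxInt64} via the helper below;
	// files 0 and 2 are recorded as fully consumed and must never be opened,
	// while file 1 (offset 0) gets no entry and is read from the beginning.
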
	// setInputOffsets configures the import spec with the appropriate input offsets.
	setInputOffsets := func(
		t *testing.T, spec *execinfrapb.ReadImportDataSpec, offsets []int64,
	) *execinfrapb.ReadImportDataSpec {
		if len(spec.Uri) != len(offsets) {
			t.Fatal("Expected matching number of input offsets")
		}
		spec.ResumePos = make(map[int32]int64)
		for id, offset := range offsets {
			if offset > 0 {
				spec.ResumePos[int32(id)] = offset
			}
		}
		return spec
	}

	for _, testCase := range tests {
		t.Run(fmt.Sprintf("processes-files-once-%s", testCase.name), func(t *testing.T) {
			spec := setInputOffsets(t, testCase.spec.getConverterSpec(), testCase.inputOffsets)

			processor, err := newReadImportDataProcessor(flowCtx, 0, *spec, &errorReportingRowReceiver{t})

			if err != nil {
				t.Fatalf("Could not create data processor: %v", err)
			}

			processor.Run(context.Background())
		})
	}
}

type observedKeys struct {
	syncutil.Mutex
	keys []roachpb.Key
}

func TestImportHonorsResumePosition(t *testing.T) {
	defer leaktest.AfterTest(t)()

	batchSize := 13
	defer row.TestingSetDatumRowConverterBatchSize(batchSize)()

	pkBulkAdder := &doNothingKeyAdder{}

	evalCtx := tree.MakeTestingEvalContext(nil)
	flowCtx := &execinfra.FlowCtx{
		EvalCtx: &evalCtx,
		Cfg: &execinfra.ServerConfig{
			Settings:        &cluster.Settings{},
			ExternalStorage: externalStorageFactory,
			BulkAdder: func(
				_ context.Context, _ *kv.DB, _ hlc.Timestamp,
				opts kvserverbase.BulkAdderOptions) (kvserverbase.BulkAdder, error) {
				if opts.Name == "pkAdder" {
					return pkBulkAdder, nil
				}
				return &doNothingKeyAdder{}, nil
			},
			TestingKnobs: execinfra.TestingKnobs{
				BulkAdderFlushesEveryBatch: true,
			},
		},
	}

	// In this test, we'll specify various resume positions for different
	// input formats. We expect that the rows before the resume position
	// will be skipped.
	// NB: We assume that the (external) test files are sorted and contain
	// a sufficient number of rows.
	testSpecs := []testSpec{
		newTestSpec(t, csvFormat(), "testdata/csv/data-0"),
		newTestSpec(t, mysqlDumpFormat(), "testdata/mysqldump/simple.sql"),
		newTestSpec(t, mysqlOutFormat(), "testdata/mysqlout/csv-ish/simple.txt"),
		newTestSpec(t, pgCopyFormat(), "testdata/pgcopy/default/test.txt"),
		newTestSpec(t, pgDumpFormat(), "testdata/pgdump/simple.sql"),
		newTestSpec(t, avroFormat(t, roachpb.AvroOptions_JSON_RECORDS), "testdata/avro/simple-sorted.json"),
	}

	resumes := []int64{0, 10, 64, eofOffset}

	for _, testCase := range testSpecs {
		spec := testCase.getConverterSpec()
		keys := &observedKeys{keys: make([]roachpb.Key, 0, 1000)}
		numKeys := 0

		for _, resumePos := range resumes {
			spec.ResumePos = map[int32]int64{0: resumePos}
			if resumePos == 0 {
				// We use the 0 resume position to record the set of keys in the input file.
				pkBulkAdder.onKeyAdd = func(k roachpb.Key) {
					keys.Lock()
					keys.keys = append(keys.keys, k)
					keys.Unlock()
				}
			} else {
				if resumePos != eofOffset && resumePos > int64(numKeys) {
					t.Logf("test skipped: resume position %d > number of keys %d", resumePos, numKeys)
					continue
				}

				// For other resume positions, we want to ensure that the keys
				// we add are not among the first [0, resumePos) keys.
				pkBulkAdder.onKeyAdd = func(k roachpb.Key) {
					maxKeyIdx := int(resumePos)
					if resumePos == eofOffset {
						maxKeyIdx = numKeys
					}
					keys.Lock()
					idx := sort.Search(maxKeyIdx, func(i int) bool { return keys.keys[i].Compare(k) == 0 })
					if idx < maxKeyIdx {
						t.Errorf("failed to skip key[%d]=%s", idx, k)
					}
					keys.Unlock()
				}
			}

			t.Run(fmt.Sprintf("resume-%v-%v", spec.Format.Format, resumePos), func(t *testing.T) {
				rp := resumePos
				progCh := make(chan execinfrapb.RemoteProducerMetadata_BulkProcessorProgress)
				defer close(progCh)

				// Set up the progress consumer.
				go func() {
					// Consume progress reports. Since we expect every batch to be
					// flushed (BulkAdderFlushesEveryBatch), the progress report must
					// be emitted every batchSize rows (possibly out of order),
					// starting from our initial resumePos.
					for prog := range progCh {
						if !t.Failed() && prog.ResumePos[0] < (rp+int64(batchSize)) {
							t.Logf("unexpected progress resume pos: %d", prog.ResumePos[0])
							t.Fail()
						}
					}
				}()

				_, err := runImport(context.Background(), flowCtx, spec, progCh)

				if err != nil {
					t.Fatal(err)
				}
			})

			if resumePos == 0 {
				// Even though the input is assumed to be sorted, we may still observe
				// bulk adder keys arriving out of order. We need to sort the keys.
				keys.Lock()
				sort.Slice(keys.keys, func(i int, j int) bool {
					return keys.keys[i].Compare(keys.keys[j]) < 0
				})
				numKeys = len(keys.keys)
				keys.Unlock()
			}
		}
	}
}

// syncBarrier allows two goroutines (a controller and a worker) to
// synchronize with each other. The controller portion of the barrier waits
// until the worker starts running, and then notifies the worker to proceed.
// The worker does the opposite: it notifies the controller that it has
// started running, and then waits for the proceed signal.
type syncBarrier interface {
	// Enter blocks on the barrier, and returns a function that,
	// when executed, unblocks the other party.
	Enter() func()
}

type barrier struct {
	read       <-chan struct{}
	write      chan<- struct{}
	controller bool
}

// newSyncBarrier returns controller and worker barriers.
func newSyncBarrier() (syncBarrier, syncBarrier) {
	p1 := make(chan struct{})
	p2 := make(chan struct{})
	return &barrier{p1, p2, true}, &barrier{p2, p1, false}
}

func (b *barrier) Enter() func() {
	if b.controller {
		b.write <- struct{}{}
		return func() { <-b.read }
	}

	<-b.read
	return func() { b.write <- struct{}{} }
}
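
// A minimal sketch of how the barrier pair is meant to be used (illustrative
// only; the real usages are in the CSV import tests below, where the worker
// side is wired into a csv generator breakpoint):
//
//	ctrl, worker := newSyncBarrier()
//
//	go func() {
//		release := worker.Enter() // blocks until the controller has entered
//		defer release()           // unblocks the controller's returned function
//		// ... the worker is now at its rendezvous point ...
//	}()
//
//	unblock := ctrl.Enter() // blocks until the worker reaches its Enter()
//	unblock()               // completes the second handshake, releasing the worker
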
// cancellableImportResumer is a special jobs.Resumer that, instead of
// finishing the job successfully, forces the job to be paused.
var _ jobs.Resumer = &cancellableImportResumer{}

type cancellableImportResumer struct {
	ctx              context.Context
	jobIDCh          chan int64
	jobID            int64
	onSuccessBarrier syncBarrier
	wrapped          *importResumer
}

func (r *cancellableImportResumer) Resume(
	_ context.Context, phs interface{}, resultsCh chan<- tree.Datums,
) error {
	r.jobID = *r.wrapped.job.ID()
	r.jobIDCh <- r.jobID
	if err := r.wrapped.Resume(r.ctx, phs, resultsCh); err != nil {
		return err
	}
	if r.onSuccessBarrier != nil {
		defer r.onSuccessBarrier.Enter()()
	}
	return errors.New("job succeeded, but we're forcing it to be paused")
}

func (r *cancellableImportResumer) OnFailOrCancel(ctx context.Context, phs interface{}) error {
	// This callback is invoked when an error or cancellation occurs
	// during the import. Since our Resume handler returned an error
	// (after pausing the job), we need to short-circuit the jobs
	// machinery so that this job is not marked as failed.
	return errors.New("bail out")
}

func setImportReaderParallelism(parallelism int32) func() {
	factory := rowexec.NewReadImportDataProcessor
	rowexec.NewReadImportDataProcessor = func(
		flowCtx *execinfra.FlowCtx, processorID int32,
		spec execinfrapb.ReadImportDataSpec, output execinfra.RowReceiver) (execinfra.Processor, error) {
		spec.ReaderParallelism = parallelism
		return factory(flowCtx, processorID, spec, output)
	}

	return func() {
		rowexec.NewReadImportDataProcessor = factory
	}
}

// jobState captures the queried status and import progress of a job.
type jobState struct {
	err    error
	status jobs.Status
	prog   jobspb.ImportProgress
}

func queryJob(db sqlutils.DBHandle, jobID int64) (js jobState) {
	js = jobState{
		err:    nil,
		status: "",
		prog:   jobspb.ImportProgress{},
	}
	var progressBytes, payloadBytes []byte
	js.err = db.QueryRowContext(
		context.Background(), "SELECT status, payload, progress FROM system.jobs WHERE id = $1", jobID).Scan(
		&js.status, &payloadBytes, &progressBytes)
	if js.err != nil {
		return
	}

	if js.status == jobs.StatusFailed {
		payload := &jobspb.Payload{}
		js.err = protoutil.Unmarshal(payloadBytes, payload)
		if js.err == nil {
			js.err = errors.Newf("%s", payload.Error)
		}
		return
	}

	progress := &jobspb.Progress{}
	if js.err = protoutil.Unmarshal(progressBytes, progress); js.err != nil {
		return
	}
	js.prog = *(progress.Details.(*jobspb.Progress_Import).Import)
	return
}

// queryJobUntil repeatedly queries the job status/progress until the
// specified function returns true.
func queryJobUntil(
	t *testing.T, db sqlutils.DBHandle, jobID int64, isDone func(js jobState) bool,
) (js jobState) {
	t.Helper()
	for r := retry.Start(base.DefaultRetryOptions()); r.Next(); {
		js = queryJob(db, jobID)
		if js.err != nil || isDone(js) {
			break
		}
	}
	if js.err != nil {
		t.Fatal(js.err)
	}
	return
}

func TestCSVImportCanBeResumed(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer setImportReaderParallelism(1)()
	const batchSize = 5
	defer TestingSetParallelImporterReaderBatchSize(batchSize)()
	defer row.TestingSetDatumRowConverterBatchSize(2 * batchSize)()
	jobs.DefaultAdoptInterval = 100 * time.Millisecond

	s, db, _ := serverutils.StartServer(t,
		base.TestServerArgs{
			Knobs: base.TestingKnobs{
				RegistryLiveness: jobs.NewFakeNodeLiveness(1),
				DistSQL: &execinfra.TestingKnobs{
					BulkAdderFlushesEveryBatch: true,
				},
			},
		})
	registry := s.JobRegistry().(*jobs.Registry)
	ctx := context.Background()
	defer s.Stopper().Stop(ctx)

	sqlDB := sqlutils.MakeSQLRunner(db)
	sqlDB.Exec(t, `CREATE DATABASE d`)
	sqlDB.Exec(t, "CREATE TABLE t (id INT, data STRING)")
	defer sqlDB.Exec(t, `DROP TABLE t`)

	jobCtx, cancelImport := context.WithCancel(ctx)
	jobIDCh := make(chan int64)
	var jobID int64 = -1
	var importSummary backupccl.RowCount

	registry.TestingResumerCreationKnobs = map[jobspb.Type]func(raw jobs.Resumer) jobs.Resumer{
		// Arrange for our special job resumer to be returned the very
		// first time we start the import.
		jobspb.TypeImport: func(raw jobs.Resumer) jobs.Resumer {
			resumer := raw.(*importResumer)
			resumer.testingKnobs.ignoreProtectedTimestamps = true
			resumer.testingKnobs.alwaysFlushJobProgress = true
			resumer.testingKnobs.afterImport = func(summary backupccl.RowCount) error {
				importSummary = summary
				return nil
			}
			if jobID == -1 {
				return &cancellableImportResumer{
					ctx:     jobCtx,
					jobIDCh: jobIDCh,
					wrapped: resumer,
				}
			}
			return resumer
		},
	}

	testBarrier, csvBarrier := newSyncBarrier()
	csv1 := newCsvGenerator(0, 10*batchSize+1, &intGenerator{}, &strGenerator{})
	csv1.addBreakpoint(7*batchSize, func() (bool, error) {
		defer csvBarrier.Enter()()
		return false, nil
	})

	// Convince distsql to use our "external" storage implementation.
	storage := newGeneratedStorage(csv1)
	s.DistSQLServer().(*distsql.ServerImpl).ServerConfig.ExternalStorage = storage.externalStorageFactory()

	// Execute the import; ignore any errors returned
	// (since we're aborting the first import run).
	go func() {
		_, _ = sqlDB.DB.ExecContext(ctx,
			`IMPORT INTO t (id, data) CSV DATA ($1)`, storage.getGeneratorURIs()[0])
	}()

	// Wait for the job to start running.
	jobID = <-jobIDCh

	// Wait until we are blocked handling the breakpoint.
	unblockImport := testBarrier.Enter()
	// Wait until we have recorded some job progress.
	js := queryJobUntil(t, sqlDB.DB, jobID, func(js jobState) bool { return js.prog.ResumePos[0] > 0 })

	// Pause the job.
	if err := registry.PauseRequested(ctx, nil, jobID); err != nil {
		t.Fatal(err)
	}
	// Send the cancellation and unblock the breakpoint.
	cancelImport()
	unblockImport()

	// Get the updated resume position counter.
	js = queryJobUntil(t, sqlDB.DB, jobID, func(js jobState) bool { return jobs.StatusPaused == js.status })
	resumePos := js.prog.ResumePos[0]
	t.Logf("Resume pos: %v\n", js.prog.ResumePos[0])

	// Resume the job and wait for it to complete.
	if err := registry.Resume(ctx, nil, jobID); err != nil {
		t.Fatal(err)
	}
	js = queryJobUntil(t, sqlDB.DB, jobID, func(js jobState) bool { return jobs.StatusSucceeded == js.status })

	// Verify that the import proceeded from the resume position.
	assert.Equal(t, importSummary.Rows, int64(csv1.numRows)-resumePos)

	sqlDB.CheckQueryResults(t, `SELECT id FROM t ORDER BY id`,
		sqlDB.QueryStr(t, `SELECT generate_series(0, $1)`, csv1.numRows-1),
	)
}

func TestCSVImportMarksFilesFullyProcessed(t *testing.T) {
	defer leaktest.AfterTest(t)()
	const batchSize = 5
	defer TestingSetParallelImporterReaderBatchSize(batchSize)()
	defer row.TestingSetDatumRowConverterBatchSize(2 * batchSize)()
	jobs.DefaultAdoptInterval = 100 * time.Millisecond

	s, db, _ := serverutils.StartServer(t,
		base.TestServerArgs{
			Knobs: base.TestingKnobs{
				RegistryLiveness: jobs.NewFakeNodeLiveness(1),
				DistSQL: &execinfra.TestingKnobs{
					BulkAdderFlushesEveryBatch: true,
				},
			},
		})
	registry := s.JobRegistry().(*jobs.Registry)
	ctx := context.Background()
	defer s.Stopper().Stop(ctx)

	sqlDB := sqlutils.MakeSQLRunner(db)
	sqlDB.Exec(t, `CREATE DATABASE d`)
	sqlDB.Exec(t, "CREATE TABLE t (id INT, data STRING)")
	defer sqlDB.Exec(t, `DROP TABLE t`)

	jobIDCh := make(chan int64)
	controllerBarrier, importBarrier := newSyncBarrier()

	var jobID int64 = -1
	var importSummary backupccl.RowCount

	registry.TestingResumerCreationKnobs = map[jobspb.Type]func(raw jobs.Resumer) jobs.Resumer{
		// Arrange for our special job resumer to be returned the very
		// first time we start the import.
		jobspb.TypeImport: func(raw jobs.Resumer) jobs.Resumer {
			resumer := raw.(*importResumer)
			resumer.testingKnobs.alwaysFlushJobProgress = true
			resumer.testingKnobs.ignoreProtectedTimestamps = true
			resumer.testingKnobs.afterImport = func(summary backupccl.RowCount) error {
				importSummary = summary
				return nil
			}
			if jobID == -1 {
				return &cancellableImportResumer{
					ctx:              ctx,
					jobIDCh:          jobIDCh,
					onSuccessBarrier: importBarrier,
					wrapped:          resumer,
				}
			}
			return resumer
		},
	}

	csv1 := newCsvGenerator(0, 10*batchSize+1, &intGenerator{}, &strGenerator{})
	csv2 := newCsvGenerator(0, 20*batchSize-1, &intGenerator{}, &strGenerator{})
	csv3 := newCsvGenerator(0, 1, &intGenerator{}, &strGenerator{})

	// Convince distsql to use our "external" storage implementation.
	storage := newGeneratedStorage(csv1, csv2, csv3)
	s.DistSQLServer().(*distsql.ServerImpl).ServerConfig.ExternalStorage = storage.externalStorageFactory()

	// Execute the import; ignore any errors returned
	// (since we're aborting the first import run).
	go func() {
		_, _ = sqlDB.DB.ExecContext(ctx,
			`IMPORT INTO t (id, data) CSV DATA ($1, $2, $3)`, storage.getGeneratorURIs()...)
	}()

	// Wait for the job to start running.
	jobID = <-jobIDCh

	// Wait for the importer to reach its onSuccess barrier; proceedImport
	// will later let it continue.
	proceedImport := controllerBarrier.Enter()

	// Pause the job.
	if err := registry.PauseRequested(ctx, nil, jobID); err != nil {
		t.Fatal(err)
	}

	// All files should have been processed,
	// and the resume position set to math.MaxInt64.
	js := queryJobUntil(t, sqlDB.DB, jobID, func(js jobState) bool { return jobs.StatusPaused == js.status })
	for _, pos := range js.prog.ResumePos {
		assert.True(t, pos == math.MaxInt64)
	}

	// Unblock the import.
	proceedImport()

	// Resume the job and wait for it to complete.
	if err := registry.Resume(ctx, nil, jobID); err != nil {
		t.Fatal(err)
	}
	js = queryJobUntil(t, sqlDB.DB, jobID, func(js jobState) bool { return jobs.StatusSucceeded == js.status })

	// Verify that after the resume we have not processed any additional rows.
	assert.Zero(t, importSummary.Rows)
}

func (ses *generatedStorage) externalStorageFactory() cloud.ExternalStorageFactory {
	return func(_ context.Context, es roachpb.ExternalStorage) (cloud.ExternalStorage, error) {
		uri, err := url.Parse(es.HttpPath.BaseUri)
		if err != nil {
			return nil, err
		}
		id, ok := ses.nameIDMap[uri.Path]
		if !ok {
			id = ses.nextID
			ses.nextID++
			ses.nameIDMap[uri.Path] = id
		}
		return &generatorExternalStorage{conf: es, gen: ses.generators[id]}, nil
	}
}

// externalStorageFactory is the external storage factory needed to run the converters.
func externalStorageFactory(
	ctx context.Context, dest roachpb.ExternalStorage,
) (cloud.ExternalStorage, error) {
	workdir, err := os.Getwd()
	if err != nil {
		return nil, err
	}
	return cloud.MakeExternalStorage(ctx, dest, base.ExternalIODirConfig{},
		nil, blobs.TestBlobServiceClient(workdir))
}

// newTestSpec is a helper to create and initialize a testSpec.
func newTestSpec(t *testing.T, format roachpb.IOFileFormat, inputs ...string) testSpec {
	spec := testSpec{
		format: format,
		inputs: make(map[int32]string),
	}

	// Initialize a table descriptor for the import. We need a valid descriptor
	// to run the converters, even though we don't actually import anything in
	// this test.
	var descr *sqlbase.TableDescriptor
	switch format.Format {
	case roachpb.IOFileFormat_CSV:
		descr = descForTable(t,
			"CREATE TABLE simple (i INT PRIMARY KEY, s text )", 10, 20, NoFKs)
	case
		roachpb.IOFileFormat_Mysqldump,
		roachpb.IOFileFormat_MysqlOutfile,
		roachpb.IOFileFormat_PgDump,
		roachpb.IOFileFormat_PgCopy,
		roachpb.IOFileFormat_Avro:
		descr = descForTable(t,
			"CREATE TABLE simple (i INT PRIMARY KEY, s text, b bytea default null)", 10, 20, NoFKs)
	default:
		t.Fatalf("Unsupported input format: %v", format)
	}

	targetCols := make([]string, len(descr.Columns))
	numCols := 0
	for i, col := range descr.Columns {
		if !col.Hidden {
			targetCols[i] = col.Name
			numCols++
		}
	}
	assert.True(t, numCols > 0)

	spec.tables = map[string]*execinfrapb.ReadImportDataSpec_ImportTable{
		"simple": {Desc: descr, TargetCols: targetCols[0:numCols]},
	}

	for id, path := range inputs {
		spec.inputs[int32(id)] = cloud.MakeLocalStorageURI(path)
	}

	return spec
}

func pgDumpFormat() roachpb.IOFileFormat {
	return roachpb.IOFileFormat{
		Format: roachpb.IOFileFormat_PgDump,
		PgDump: roachpb.PgDumpOptions{
			MaxRowSize: 64 * 1024,
		},
	}
}

func pgCopyFormat() roachpb.IOFileFormat {
	return roachpb.IOFileFormat{
		Format: roachpb.IOFileFormat_PgCopy,
		PgCopy: roachpb.PgCopyOptions{
			Delimiter:  '\t',
			Null:       `\N`,
			MaxRowSize: 4096,
		},
	}
}

func mysqlDumpFormat() roachpb.IOFileFormat {
	return roachpb.IOFileFormat{
		Format: roachpb.IOFileFormat_Mysqldump,
	}
}

func mysqlOutFormat() roachpb.IOFileFormat {
	return roachpb.IOFileFormat{
		Format: roachpb.IOFileFormat_MysqlOutfile,
		MysqlOut: roachpb.MySQLOutfileOptions{
			FieldSeparator: ',',
			RowSeparator:   '\n',
			HasEscape:      true,
			Escape:         '\\',
			Enclose:        roachpb.MySQLOutfileOptions_Always,
			Encloser:       '"',
		},
	}
}

func csvFormat() roachpb.IOFileFormat {
	return roachpb.IOFileFormat{
		Format: roachpb.IOFileFormat_CSV,
	}
}

func avroFormat(t *testing.T, format roachpb.AvroOptions_Format) roachpb.IOFileFormat {
	avro := roachpb.AvroOptions{
		Format:     format,
		StrictMode: false,
	}

	if format != roachpb.AvroOptions_OCF {
		// Need to load the schema for record-specific inputs.
		bytes, err := ioutil.ReadFile("testdata/avro/simple-schema.json")
		require.NoError(t, err)
		avro.SchemaJSON = string(bytes)
		avro.RecordSeparator = '\n'
	}

	return roachpb.IOFileFormat{
		Format: roachpb.IOFileFormat_Avro,
		Avro:   avro,
	}
}
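
// The format helpers above are combined with newTestSpec throughout this
// file. A minimal sketch (illustrative only; it mirrors the Avro case used
// in TestImportHonorsResumePosition rather than adding a new one):
//
//	spec := newTestSpec(t, avroFormat(t, roachpb.AvroOptions_JSON_RECORDS),
//		"testdata/avro/simple-sorted.json")
//	converterSpec := spec.getConverterSpec() // ReaderParallelism pinned to 1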