github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/importccl/import_processor_test.go

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Licensed as a CockroachDB Enterprise file under the Cockroach Community
     4  // License (the "License"); you may not use this file except in compliance with
     5  // the License. You may obtain a copy of the License at
     6  //
     7  //     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt
     8  
     9  package importccl
    10  
    11  import (
    12  	"context"
    13  	"fmt"
    14  	"io/ioutil"
    15  	"math"
    16  	"net/url"
    17  	"os"
    18  	"sort"
    19  	"testing"
    20  	"time"
    21  
    22  	"github.com/cockroachdb/cockroach/pkg/base"
    23  	"github.com/cockroachdb/cockroach/pkg/blobs"
    24  	"github.com/cockroachdb/cockroach/pkg/ccl/backupccl"
    25  	"github.com/cockroachdb/cockroach/pkg/jobs"
    26  	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
    27  	"github.com/cockroachdb/cockroach/pkg/kv"
    28  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    29  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    30  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/distsql"
    32  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    33  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    34  	"github.com/cockroachdb/cockroach/pkg/sql/row"
    35  	"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
    36  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    37  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    38  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    39  	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
    40  	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
    41  	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
    42  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    43  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    44  	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
    45  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    46  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    47  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    48  	"github.com/cockroachdb/errors"
    49  	"github.com/stretchr/testify/assert"
    50  	"github.com/stretchr/testify/require"
    51  )
    52  
    53  type testSpec struct {
    54  	format roachpb.IOFileFormat
    55  	inputs map[int32]string
    56  	tables map[string]*execinfrapb.ReadImportDataSpec_ImportTable
    57  }
    58  
     59  // getConverterSpec returns a ReadImportDataSpec suitable for creating an input converter.
    60  func (spec *testSpec) getConverterSpec() *execinfrapb.ReadImportDataSpec {
    61  	return &execinfrapb.ReadImportDataSpec{
    62  		Format:            spec.format,
    63  		Tables:            spec.tables,
    64  		Uri:               spec.inputs,
    65  		ReaderParallelism: 1, // Make tests deterministic
    66  	}
    67  }
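
         // Illustrative sketch: the tests below build a spec from fixture files and
         // then derive the processor spec from it, e.g.
         //
         //	spec := newTestSpec(t, csvFormat(), "testdata/csv/data-0")
         //	converterSpec := spec.getConverterSpec()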
    68  
    69  func TestConverterFlushesBatches(t *testing.T) {
    70  	defer leaktest.AfterTest(t)()
    71  	// Reset batch size setting upon test completion.
    72  	defer row.TestingSetDatumRowConverterBatchSize(0)()
    73  
    74  	// Helper to generate test name.
    75  	testName := func(format roachpb.IOFileFormat, batchSize int) string {
    76  		switch batchSize {
    77  		case 0:
    78  			return fmt.Sprintf("%s-default-batch-size", format.Format)
    79  		case 1:
    80  			return fmt.Sprintf("%s-always-flush", format.Format)
    81  		default:
    82  			return fmt.Sprintf("%s-flush-%d-records", format.Format, batchSize)
    83  		}
    84  	}
    85  
    86  	ctx := context.Background()
    87  	evalCtx := tree.MakeTestingEvalContext(nil)
    88  
    89  	tests := []testSpec{
    90  		newTestSpec(t, csvFormat(), "testdata/csv/data-0"),
    91  		newTestSpec(t, mysqlDumpFormat(), "testdata/mysqldump/simple.sql"),
    92  		newTestSpec(t, pgDumpFormat(), "testdata/pgdump/simple.sql"),
    93  		newTestSpec(t, avroFormat(t, roachpb.AvroOptions_OCF), "testdata/avro/simple.ocf"),
    94  	}
    95  
    96  	const endBatchSize = -1
    97  
    98  	for _, testCase := range tests {
    99  		expectedNumRecords := 0
   100  		expectedNumBatches := 0
   101  		converterSpec := testCase.getConverterSpec()
   102  
   103  		// Run multiple tests, increasing batch size until it exceeds the
   104  		// total number of records. When batch size is 0, we run converters
   105  		// with the default batch size, and use that run to figure out the
   106  		// expected number of records and batches for the subsequent run.
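         		// For example, if the default run yields 100 records, the subsequent
         		// runs use batch sizes 1, 5, 25, 125 (growing 5x each time), stopping
         		// once the batch size exceeds the record count; each asserted run
         		// expects ceil(expectedNumRecords / batchSize) batches.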
   107  		for batchSize := 0; batchSize != endBatchSize; {
   108  			t.Run(testName(testCase.format, batchSize), func(t *testing.T) {
   109  				if batchSize > 0 {
   110  					row.TestingSetDatumRowConverterBatchSize(batchSize)
   111  				}
   112  
   113  				kvCh := make(chan row.KVBatch, batchSize)
   114  				conv, err := makeInputConverter(ctx, converterSpec, &evalCtx, kvCh)
   115  				if err != nil {
   116  					t.Fatalf("makeInputConverter() error = %v", err)
   117  				}
   118  
   119  				group := ctxgroup.WithContext(ctx)
   120  				group.Go(func() error {
   121  					defer close(kvCh)
   122  					return conv.readFiles(ctx, testCase.inputs, nil, converterSpec.Format, externalStorageFactory)
   123  				})
   124  
   125  				lastBatch := 0
   126  				testNumRecords := 0
   127  				testNumBatches := 0
   128  
    129  				// Read from the channel; we expect batches of size batchSize,
    130  				// with the exception of the last batch.
   131  				for batch := range kvCh {
   132  					if batchSize > 0 {
   133  						assert.True(t, lastBatch == 0 || lastBatch == batchSize)
   134  					}
   135  					lastBatch = len(batch.KVs)
   136  					testNumRecords += lastBatch
   137  					testNumBatches++
   138  				}
   139  				if err := group.Wait(); err != nil {
   140  					t.Fatalf("Conversion failed: %v", err)
   141  				}
   142  
   143  				if batchSize == 0 {
   144  					expectedNumRecords = testNumRecords
   145  					// Next batch: flush every record.
   146  					batchSize = 1
   147  					expectedNumBatches = expectedNumRecords
   148  				} else if batchSize > expectedNumRecords {
   149  					// Done with this test case.
   150  					batchSize = endBatchSize
   151  					return
   152  				} else {
   153  					// Number of records and batches ought to be correct.
   154  					assert.Equal(t, expectedNumRecords, testNumRecords)
   155  					assert.Equal(t, expectedNumBatches, testNumBatches)
   156  
   157  					// Progressively increase the batch size.
   158  					batchSize += (batchSize << 2)
   159  					expectedNumBatches = int(math.Ceil(float64(expectedNumRecords) / float64(batchSize)))
   160  				}
   161  			})
   162  		}
   163  	}
   164  }
   165  
   166  // A RowReceiver implementation which fails the test if it receives an error.
   167  type errorReportingRowReceiver struct {
   168  	t *testing.T
   169  }
   170  
   171  var _ execinfra.RowReceiver = &errorReportingRowReceiver{}
   172  
   173  func (r *errorReportingRowReceiver) Push(
   174  	row sqlbase.EncDatumRow, meta *execinfrapb.ProducerMetadata,
   175  ) execinfra.ConsumerStatus {
   176  	if r.t.Failed() || (meta != nil && meta.Err != nil) {
   177  		if !r.t.Failed() {
   178  			r.t.Fail()
   179  		}
   180  		r.t.Logf("receiver got an error: %v", meta.Err)
   181  		return execinfra.ConsumerClosed
   182  	}
   183  	return execinfra.NeedMoreRows
   184  }
   185  
   186  func (r *errorReportingRowReceiver) ProducerDone() {}
   187  func (r *errorReportingRowReceiver) Types() []*types.T {
   188  	return nil
   189  }
   190  
    191  // A do-nothing BulkAdder implementation that optionally invokes callbacks on Add and Flush.
   192  type doNothingKeyAdder struct {
   193  	onKeyAdd func(key roachpb.Key)
   194  	onFlush  func()
   195  }
   196  
   197  var _ kvserverbase.BulkAdder = &doNothingKeyAdder{}
   198  
   199  func (a *doNothingKeyAdder) Add(_ context.Context, k roachpb.Key, _ []byte) error {
   200  	if a.onKeyAdd != nil {
   201  		a.onKeyAdd(k)
   202  	}
   203  	return nil
   204  }
   205  func (a *doNothingKeyAdder) Flush(_ context.Context) error {
   206  	if a.onFlush != nil {
   207  		a.onFlush()
   208  	}
   209  	return nil
   210  }
   211  
   212  func (*doNothingKeyAdder) IsEmpty() bool                     { return true }
   213  func (*doNothingKeyAdder) CurrentBufferFill() float32        { return 0 }
   214  func (*doNothingKeyAdder) GetSummary() roachpb.BulkOpSummary { return roachpb.BulkOpSummary{} }
   215  func (*doNothingKeyAdder) Close(_ context.Context)           {}
   216  func (a *doNothingKeyAdder) SetOnFlush(f func())             { a.onFlush = f }
   217  
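         // eofOffset is used as a resume position marking an input file as fully
         // processed; the import should not even open such a file.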
   218  var eofOffset int64 = math.MaxInt64
   219  
   220  func TestImportIgnoresProcessedFiles(t *testing.T) {
   221  	defer leaktest.AfterTest(t)()
   222  
   223  	evalCtx := tree.MakeTestingEvalContext(nil)
   224  	flowCtx := &execinfra.FlowCtx{
   225  		EvalCtx: &evalCtx,
   226  		Cfg: &execinfra.ServerConfig{
   227  			Settings:        &cluster.Settings{},
   228  			ExternalStorage: externalStorageFactory,
   229  			BulkAdder: func(
   230  				_ context.Context, _ *kv.DB, _ hlc.Timestamp,
   231  				_ kvserverbase.BulkAdderOptions) (kvserverbase.BulkAdder, error) {
   232  				return &doNothingKeyAdder{}, nil
   233  			},
   234  		},
   235  	}
   236  
    237  	// In this test, we'll specify import files that do not exist, but mark
    238  	// those files fully processed. The converters should not even attempt to
    239  	// open these files (and if they do, we report a test failure).
   240  	tests := []struct {
   241  		name         string
   242  		spec         testSpec
    243  		inputOffsets []int64 // Resume offset for each input; eofOffset means fully processed
   244  	}{
   245  		{
   246  			"csv-two-invalid",
   247  			newTestSpec(t, csvFormat(), "__invalid__", "testdata/csv/data-0", "/_/missing/_"),
   248  			[]int64{eofOffset, 0, eofOffset},
   249  		},
   250  		{
   251  			"csv-all-invalid",
   252  			newTestSpec(t, csvFormat(), "__invalid__", "../../&"),
   253  			[]int64{eofOffset, eofOffset},
   254  		},
   255  		{
   256  			"csv-all-valid",
   257  			newTestSpec(t, csvFormat(), "testdata/csv/data-0"),
   258  			[]int64{0},
   259  		},
   260  		{
   261  			"mysql-one-invalid",
   262  			newTestSpec(t, mysqlDumpFormat(), "testdata/mysqldump/simple.sql", "/_/missing/_"),
   263  			[]int64{0, eofOffset},
   264  		},
   265  		{
   266  			"pgdump-one-input",
   267  			newTestSpec(t, pgDumpFormat(), "testdata/pgdump/simple.sql"),
   268  			[]int64{0},
   269  		},
   270  		{
   271  			"avro-one-invalid",
   272  			newTestSpec(t, avroFormat(t, roachpb.AvroOptions_OCF), "__invalid__", "testdata/avro/simple.ocf"),
   273  			[]int64{eofOffset, 0},
   274  		},
   275  	}
   276  
    277  	// setInputOffsets configures the import spec with the given per-input resume offsets.
   278  	setInputOffsets := func(
   279  		t *testing.T, spec *execinfrapb.ReadImportDataSpec, offsets []int64,
   280  	) *execinfrapb.ReadImportDataSpec {
   281  		if len(spec.Uri) != len(offsets) {
   282  			t.Fatal("Expected matching number of input offsets")
   283  		}
   284  		spec.ResumePos = make(map[int32]int64)
   285  		for id, offset := range offsets {
   286  			if offset > 0 {
   287  				spec.ResumePos[int32(id)] = offset
   288  			}
   289  		}
   290  		return spec
   291  	}
   292  
   293  	for _, testCase := range tests {
   294  		t.Run(fmt.Sprintf("processes-files-once-%s", testCase.name), func(t *testing.T) {
   295  			spec := setInputOffsets(t, testCase.spec.getConverterSpec(), testCase.inputOffsets)
   296  
   297  			processor, err := newReadImportDataProcessor(flowCtx, 0, *spec, &errorReportingRowReceiver{t})
   298  
   299  			if err != nil {
   300  				t.Fatalf("Could not create data processor: %v", err)
   301  			}
   302  
   303  			processor.Run(context.Background())
   304  		})
   305  	}
   306  }
   307  
   308  type observedKeys struct {
   309  	syncutil.Mutex
   310  	keys []roachpb.Key
   311  }
   312  
   313  func TestImportHonorsResumePosition(t *testing.T) {
   314  	defer leaktest.AfterTest(t)()
   315  
   316  	batchSize := 13
   317  	defer row.TestingSetDatumRowConverterBatchSize(batchSize)()
   318  
   319  	pkBulkAdder := &doNothingKeyAdder{}
   320  
   321  	evalCtx := tree.MakeTestingEvalContext(nil)
   322  	flowCtx := &execinfra.FlowCtx{
   323  		EvalCtx: &evalCtx,
   324  		Cfg: &execinfra.ServerConfig{
   325  			Settings:        &cluster.Settings{},
   326  			ExternalStorage: externalStorageFactory,
   327  			BulkAdder: func(
   328  				_ context.Context, _ *kv.DB, _ hlc.Timestamp,
   329  				opts kvserverbase.BulkAdderOptions) (kvserverbase.BulkAdder, error) {
   330  				if opts.Name == "pkAdder" {
   331  					return pkBulkAdder, nil
   332  				}
   333  				return &doNothingKeyAdder{}, nil
   334  			},
   335  			TestingKnobs: execinfra.TestingKnobs{
   336  				BulkAdderFlushesEveryBatch: true,
   337  			},
   338  		},
   339  	}
   340  
    341  	// In this test, we'll specify various resume positions for
    342  	// different input formats. We expect the rows before the resume
    343  	// position to be skipped.
    344  	// NB: We assume that the (external) test files are sorted and
    345  	// contain a sufficient number of rows.
   346  	testSpecs := []testSpec{
   347  		newTestSpec(t, csvFormat(), "testdata/csv/data-0"),
   348  		newTestSpec(t, mysqlDumpFormat(), "testdata/mysqldump/simple.sql"),
   349  		newTestSpec(t, mysqlOutFormat(), "testdata/mysqlout/csv-ish/simple.txt"),
   350  		newTestSpec(t, pgCopyFormat(), "testdata/pgcopy/default/test.txt"),
   351  		newTestSpec(t, pgDumpFormat(), "testdata/pgdump/simple.sql"),
   352  		newTestSpec(t, avroFormat(t, roachpb.AvroOptions_JSON_RECORDS), "testdata/avro/simple-sorted.json"),
   353  	}
   354  
   355  	resumes := []int64{0, 10, 64, eofOffset}
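         	// Resume position 0 records the full key set; 10 and 64 exercise
         	// resuming mid-file; eofOffset means the entire file was already
         	// processed, so none of its keys should be re-added.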
   356  
   357  	for _, testCase := range testSpecs {
   358  		spec := testCase.getConverterSpec()
   359  		keys := &observedKeys{keys: make([]roachpb.Key, 0, 1000)}
   360  		numKeys := 0
   361  
   362  		for _, resumePos := range resumes {
   363  			spec.ResumePos = map[int32]int64{0: resumePos}
   364  			if resumePos == 0 {
   365  				// We use 0 resume position to record the set of keys in the input file.
   366  				pkBulkAdder.onKeyAdd = func(k roachpb.Key) {
   367  					keys.Lock()
   368  					keys.keys = append(keys.keys, k)
   369  					keys.Unlock()
   370  				}
   371  			} else {
   372  				if resumePos != eofOffset && resumePos > int64(numKeys) {
   373  					t.Logf("test skipped: resume position %d > number of keys %d", resumePos, numKeys)
   374  					continue
   375  				}
   376  
   377  				// For other resume positions, we want to ensure that
    378  				// the key we add is not among the first resumePos keys, i.e. [0, resumePos).
   379  				pkBulkAdder.onKeyAdd = func(k roachpb.Key) {
   380  					maxKeyIdx := int(resumePos)
   381  					if resumePos == eofOffset {
   382  						maxKeyIdx = numKeys
   383  					}
   384  					keys.Lock()
   385  					idx := sort.Search(maxKeyIdx, func(i int) bool { return keys.keys[i].Compare(k) == 0 })
   386  					if idx < maxKeyIdx {
   387  						t.Errorf("failed to skip key[%d]=%s", idx, k)
   388  					}
   389  					keys.Unlock()
   390  				}
   391  			}
   392  
   393  			t.Run(fmt.Sprintf("resume-%v-%v", spec.Format.Format, resumePos), func(t *testing.T) {
   394  				rp := resumePos
   395  				progCh := make(chan execinfrapb.RemoteProducerMetadata_BulkProcessorProgress)
   396  				defer close(progCh)
   397  
    398  				// Set up the progress consumer.
   399  				go func() {
    400  					// Consume progress reports. Since we expect every batch to be flushed
    401  					// (BulkAdderFlushesEveryBatch), a progress report must be emitted every
    402  					// batchSize rows (possibly out of order), starting from our initial resumePos.
   403  					for prog := range progCh {
   404  						if !t.Failed() && prog.ResumePos[0] < (rp+int64(batchSize)) {
   405  							t.Logf("unexpected progress resume pos: %d", prog.ResumePos[0])
   406  							t.Fail()
   407  						}
   408  					}
   409  				}()
   410  
   411  				_, err := runImport(context.Background(), flowCtx, spec, progCh)
   412  
   413  				if err != nil {
   414  					t.Fatal(err)
   415  				}
   416  			})
   417  
   418  			if resumePos == 0 {
   419  				// Even though the input is assumed to be sorted, we may still observe
   420  				// bulk adder keys arriving out of order.  We need to sort the keys.
   421  				keys.Lock()
   422  				sort.Slice(keys.keys, func(i int, j int) bool {
   423  					return keys.keys[i].Compare(keys.keys[j]) < 0
   424  				})
   425  				numKeys = len(keys.keys)
   426  				keys.Unlock()
   427  			}
   428  		}
   429  	}
   430  }
   431  
    432  // syncBarrier allows 2 threads (a controller and a worker) to
    433  // synchronize with each other. The controller portion of the
    434  // barrier waits until the worker starts running, and then notifies
    435  // the worker to proceed. The worker does the opposite: it notifies
    436  // the controller that it started running, and waits for the proceed signal.
   437  type syncBarrier interface {
    438  	// Enter blocks until the other side enters the barrier, and returns a
    439  	// function that, when executed, unblocks the other side.
   440  	Enter() func()
   441  }
   442  
   443  type barrier struct {
   444  	read       <-chan struct{}
   445  	write      chan<- struct{}
   446  	controller bool
   447  }
   448  
   449  // Returns controller/worker barriers.
   450  func newSyncBarrier() (syncBarrier, syncBarrier) {
   451  	p1 := make(chan struct{})
   452  	p2 := make(chan struct{})
   453  	return &barrier{p1, p2, true}, &barrier{p2, p1, false}
   454  }
   455  
   456  func (b *barrier) Enter() func() {
   457  	if b.controller {
   458  		b.write <- struct{}{}
   459  		return func() { <-b.read }
   460  	}
   461  
   462  	<-b.read
   463  	return func() { b.write <- struct{}{} }
   464  }
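
         // exampleBarrierUsage is an illustrative, unused sketch of how the
         // controller/worker barrier pair above is meant to be used: both sides
         // rendezvous in Enter, the controller inspects state while the worker is
         // parked, and the returned functions complete a second rendezvous that
         // releases the worker.
         func exampleBarrierUsage() {
         	controller, worker := newSyncBarrier()
         
         	go func() {
         		// Worker: rendezvous with the controller; the deferred call blocks
         		// again until the controller invokes its release function.
         		defer worker.Enter()()
         		// ... work observed by the controller happens here ...
         	}()
         
         	// Controller: block until the worker reaches Enter, perform any checks,
         	// then release the worker.
         	release := controller.Enter()
         	release()
         }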
   465  
   466  // A special jobs.Resumer that, instead of finishing
   467  // the job successfully, forces the job to be paused.
   468  var _ jobs.Resumer = &cancellableImportResumer{}
   469  
   470  type cancellableImportResumer struct {
   471  	ctx              context.Context
   472  	jobIDCh          chan int64
   473  	jobID            int64
   474  	onSuccessBarrier syncBarrier
   475  	wrapped          *importResumer
   476  }
   477  
   478  func (r *cancellableImportResumer) Resume(
   479  	_ context.Context, phs interface{}, resultsCh chan<- tree.Datums,
   480  ) error {
   481  	r.jobID = *r.wrapped.job.ID()
   482  	r.jobIDCh <- r.jobID
   483  	if err := r.wrapped.Resume(r.ctx, phs, resultsCh); err != nil {
   484  		return err
   485  	}
   486  	if r.onSuccessBarrier != nil {
   487  		defer r.onSuccessBarrier.Enter()()
   488  	}
    489  	return errors.New("job succeeded, but we're forcing it to be paused")
   490  }
   491  
   492  func (r *cancellableImportResumer) OnFailOrCancel(ctx context.Context, phs interface{}) error {
   493  	// This callback is invoked when an error or cancellation occurs
   494  	// during the import. Since our Resume handler returned an
    495  	// error (after pausing the job), we need to short-circuit the
    496  	// jobs machinery so that this job is not marked as failed.
   497  	return errors.New("bail out")
   498  }
   499  
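         // setImportReaderParallelism overrides the ReadImportData processor factory
         // so that every import reader processor runs with the given parallelism; the
         // returned function restores the original factory.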
   500  func setImportReaderParallelism(parallelism int32) func() {
   501  	factory := rowexec.NewReadImportDataProcessor
   502  	rowexec.NewReadImportDataProcessor = func(
   503  		flowCtx *execinfra.FlowCtx, processorID int32,
   504  		spec execinfrapb.ReadImportDataSpec, output execinfra.RowReceiver) (execinfra.Processor, error) {
   505  		spec.ReaderParallelism = parallelism
   506  		return factory(flowCtx, processorID, spec, output)
   507  	}
   508  
   509  	return func() {
   510  		rowexec.NewReadImportDataProcessor = factory
   511  	}
   512  }
   513  
    514  // jobState captures the status and the import progress of a job.
   515  type jobState struct {
   516  	err    error
   517  	status jobs.Status
   518  	prog   jobspb.ImportProgress
   519  }
   520  
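         // queryJob reads the job's status, payload, and progress from system.jobs,
         // surfacing the job's error if it failed and its decoded import progress
         // otherwise.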
   521  func queryJob(db sqlutils.DBHandle, jobID int64) (js jobState) {
   522  	js = jobState{
   523  		err:    nil,
   524  		status: "",
   525  		prog:   jobspb.ImportProgress{},
   526  	}
   527  	var progressBytes, payloadBytes []byte
   528  	js.err = db.QueryRowContext(
   529  		context.Background(), "SELECT status, payload, progress FROM system.jobs WHERE id = $1", jobID).Scan(
   530  		&js.status, &payloadBytes, &progressBytes)
   531  	if js.err != nil {
   532  		return
   533  	}
   534  
   535  	if js.status == jobs.StatusFailed {
   536  		payload := &jobspb.Payload{}
   537  		js.err = protoutil.Unmarshal(payloadBytes, payload)
   538  		if js.err == nil {
   539  			js.err = errors.Newf("%s", payload.Error)
   540  		}
   541  		return
   542  	}
   543  
   544  	progress := &jobspb.Progress{}
   545  	if js.err = protoutil.Unmarshal(progressBytes, progress); js.err != nil {
   546  		return
   547  	}
   548  	js.prog = *(progress.Details.(*jobspb.Progress_Import).Import)
   549  	return
   550  }
   551  
    552  // queryJobUntil repeatedly queries job status/progress until the specified function returns true.
   553  func queryJobUntil(
   554  	t *testing.T, db sqlutils.DBHandle, jobID int64, isDone func(js jobState) bool,
   555  ) (js jobState) {
   556  	t.Helper()
   557  	for r := retry.Start(base.DefaultRetryOptions()); r.Next(); {
   558  		js = queryJob(db, jobID)
   559  		if js.err != nil || isDone(js) {
   560  			break
   561  		}
   562  	}
   563  	if js.err != nil {
   564  		t.Fatal(js.err)
   565  	}
   566  	return
   567  }
   568  
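         // TestCSVImportCanBeResumed pauses a CSV import while the input generator is
         // blocked on a breakpoint, records the job's resume position, then resumes
         // the job and verifies that only the remaining rows are imported.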
   569  func TestCSVImportCanBeResumed(t *testing.T) {
   570  	defer leaktest.AfterTest(t)()
   571  	defer setImportReaderParallelism(1)()
   572  	const batchSize = 5
   573  	defer TestingSetParallelImporterReaderBatchSize(batchSize)()
   574  	defer row.TestingSetDatumRowConverterBatchSize(2 * batchSize)()
   575  	jobs.DefaultAdoptInterval = 100 * time.Millisecond
   576  
   577  	s, db, _ := serverutils.StartServer(t,
   578  		base.TestServerArgs{
   579  			Knobs: base.TestingKnobs{
   580  				RegistryLiveness: jobs.NewFakeNodeLiveness(1),
   581  				DistSQL: &execinfra.TestingKnobs{
   582  					BulkAdderFlushesEveryBatch: true,
   583  				},
   584  			},
   585  		})
   586  	registry := s.JobRegistry().(*jobs.Registry)
   587  	ctx := context.Background()
   588  	defer s.Stopper().Stop(ctx)
   589  
   590  	sqlDB := sqlutils.MakeSQLRunner(db)
   591  	sqlDB.Exec(t, `CREATE DATABASE d`)
   592  	sqlDB.Exec(t, "CREATE TABLE t (id INT, data STRING)")
   593  	defer sqlDB.Exec(t, `DROP TABLE t`)
   594  
   595  	jobCtx, cancelImport := context.WithCancel(ctx)
   596  	jobIDCh := make(chan int64)
   597  	var jobID int64 = -1
   598  	var importSummary backupccl.RowCount
   599  
   600  	registry.TestingResumerCreationKnobs = map[jobspb.Type]func(raw jobs.Resumer) jobs.Resumer{
   601  		// Arrange for our special job resumer to be
   602  		// returned the very first time we start the import.
   603  		jobspb.TypeImport: func(raw jobs.Resumer) jobs.Resumer {
   604  
   605  			resumer := raw.(*importResumer)
   606  			resumer.testingKnobs.ignoreProtectedTimestamps = true
   607  			resumer.testingKnobs.alwaysFlushJobProgress = true
   608  			resumer.testingKnobs.afterImport = func(summary backupccl.RowCount) error {
   609  				importSummary = summary
   610  				return nil
   611  			}
   612  			if jobID == -1 {
   613  				return &cancellableImportResumer{
   614  					ctx:     jobCtx,
   615  					jobIDCh: jobIDCh,
   616  					wrapped: resumer,
   617  				}
   618  			}
   619  			return resumer
   620  		},
   621  	}
   622  
   623  	testBarrier, csvBarrier := newSyncBarrier()
   624  	csv1 := newCsvGenerator(0, 10*batchSize+1, &intGenerator{}, &strGenerator{})
   625  	csv1.addBreakpoint(7*batchSize, func() (bool, error) {
   626  		defer csvBarrier.Enter()()
   627  		return false, nil
   628  	})
   629  
   630  	// Convince distsql to use our "external" storage implementation.
   631  	storage := newGeneratedStorage(csv1)
   632  	s.DistSQLServer().(*distsql.ServerImpl).ServerConfig.ExternalStorage = storage.externalStorageFactory()
   633  
   634  	// Execute import; ignore any errors returned
    635  	// (since we're aborting the first import run).
   636  	go func() {
   637  		_, _ = sqlDB.DB.ExecContext(ctx,
   638  			`IMPORT INTO t (id, data) CSV DATA ($1)`, storage.getGeneratorURIs()[0])
   639  	}()
   640  
   641  	// Wait for the job to start running
   642  	jobID = <-jobIDCh
   643  
   644  	// Wait until we are blocked handling breakpoint.
   645  	unblockImport := testBarrier.Enter()
   646  	// Wait until we have recorded some job progress.
   647  	js := queryJobUntil(t, sqlDB.DB, jobID, func(js jobState) bool { return js.prog.ResumePos[0] > 0 })
   648  
    649  	// Pause the job.
   650  	if err := registry.PauseRequested(ctx, nil, jobID); err != nil {
   651  		t.Fatal(err)
   652  	}
   653  	// Send cancellation and unblock breakpoint.
   654  	cancelImport()
   655  	unblockImport()
   656  
   657  	// Get updated resume position counter.
   658  	js = queryJobUntil(t, sqlDB.DB, jobID, func(js jobState) bool { return jobs.StatusPaused == js.status })
   659  	resumePos := js.prog.ResumePos[0]
   660  	t.Logf("Resume pos: %v\n", js.prog.ResumePos[0])
   661  
   662  	// Resume the job and wait for it to complete.
   663  	if err := registry.Resume(ctx, nil, jobID); err != nil {
   664  		t.Fatal(err)
   665  	}
   666  	js = queryJobUntil(t, sqlDB.DB, jobID, func(js jobState) bool { return jobs.StatusSucceeded == js.status })
   667  
    668  	// Verify that the import proceeded from the recorded resume position.
   669  	assert.Equal(t, importSummary.Rows, int64(csv1.numRows)-resumePos)
   670  
   671  	sqlDB.CheckQueryResults(t, `SELECT id FROM t ORDER BY id`,
   672  		sqlDB.QueryStr(t, `SELECT generate_series(0, $1)`, csv1.numRows-1),
   673  	)
   674  }
   675  
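         // TestCSVImportMarksFilesFullyProcessed pauses an import just before its job
         // completes, verifies that every input file's resume position is marked fully
         // processed (math.MaxInt64), and checks that resuming imports no further rows.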
   676  func TestCSVImportMarksFilesFullyProcessed(t *testing.T) {
   677  	defer leaktest.AfterTest(t)()
   678  	const batchSize = 5
   679  	defer TestingSetParallelImporterReaderBatchSize(batchSize)()
   680  	defer row.TestingSetDatumRowConverterBatchSize(2 * batchSize)()
   681  	jobs.DefaultAdoptInterval = 100 * time.Millisecond
   682  
   683  	s, db, _ := serverutils.StartServer(t,
   684  		base.TestServerArgs{
   685  			Knobs: base.TestingKnobs{
   686  				RegistryLiveness: jobs.NewFakeNodeLiveness(1),
   687  				DistSQL: &execinfra.TestingKnobs{
   688  					BulkAdderFlushesEveryBatch: true,
   689  				},
   690  			},
   691  		})
   692  	registry := s.JobRegistry().(*jobs.Registry)
   693  	ctx := context.Background()
   694  	defer s.Stopper().Stop(ctx)
   695  
   696  	sqlDB := sqlutils.MakeSQLRunner(db)
   697  	sqlDB.Exec(t, `CREATE DATABASE d`)
   698  	sqlDB.Exec(t, "CREATE TABLE t (id INT, data STRING)")
   699  	defer sqlDB.Exec(t, `DROP TABLE t`)
   700  
   701  	jobIDCh := make(chan int64)
   702  	controllerBarrier, importBarrier := newSyncBarrier()
   703  
   704  	var jobID int64 = -1
   705  	var importSummary backupccl.RowCount
   706  
   707  	registry.TestingResumerCreationKnobs = map[jobspb.Type]func(raw jobs.Resumer) jobs.Resumer{
   708  		// Arrange for our special job resumer to be
   709  		// returned the very first time we start the import.
   710  		jobspb.TypeImport: func(raw jobs.Resumer) jobs.Resumer {
   711  			resumer := raw.(*importResumer)
   712  			resumer.testingKnobs.alwaysFlushJobProgress = true
   713  			resumer.testingKnobs.ignoreProtectedTimestamps = true
   714  			resumer.testingKnobs.afterImport = func(summary backupccl.RowCount) error {
   715  				importSummary = summary
   716  				return nil
   717  			}
   718  			if jobID == -1 {
   719  				return &cancellableImportResumer{
   720  					ctx:              ctx,
   721  					jobIDCh:          jobIDCh,
   722  					onSuccessBarrier: importBarrier,
   723  					wrapped:          resumer,
   724  				}
   725  			}
   726  			return resumer
   727  		},
   728  	}
   729  
   730  	csv1 := newCsvGenerator(0, 10*batchSize+1, &intGenerator{}, &strGenerator{})
   731  	csv2 := newCsvGenerator(0, 20*batchSize-1, &intGenerator{}, &strGenerator{})
   732  	csv3 := newCsvGenerator(0, 1, &intGenerator{}, &strGenerator{})
   733  
   734  	// Convince distsql to use our "external" storage implementation.
   735  	storage := newGeneratedStorage(csv1, csv2, csv3)
   736  	s.DistSQLServer().(*distsql.ServerImpl).ServerConfig.ExternalStorage = storage.externalStorageFactory()
   737  
   738  	// Execute import; ignore any errors returned
   739  	// (since we're aborting the first import run).
   740  	go func() {
   741  		_, _ = sqlDB.DB.ExecContext(ctx,
   742  			`IMPORT INTO t (id, data) CSV DATA ($1, $2, $3)`, storage.getGeneratorURIs()...)
   743  	}()
   744  
   745  	// Wait for the job to start running
   746  	jobID = <-jobIDCh
   747  
    748  	// Tell the importer that it can continue with its onSuccess barrier.
   749  	proceedImport := controllerBarrier.Enter()
   750  
    751  	// Pause the job.
   752  	if err := registry.PauseRequested(ctx, nil, jobID); err != nil {
   753  		t.Fatal(err)
   754  	}
   755  
    756  	// All files should have been processed,
    757  	// and the resume positions set to math.MaxInt64.
   758  	js := queryJobUntil(t, sqlDB.DB, jobID, func(js jobState) bool { return jobs.StatusPaused == js.status })
   759  	for _, pos := range js.prog.ResumePos {
   760  		assert.True(t, pos == math.MaxInt64)
   761  	}
   762  
    763  	// Unblock the import's onSuccess barrier.
   764  	proceedImport()
   765  
   766  	// Resume the job and wait for it to complete.
   767  	if err := registry.Resume(ctx, nil, jobID); err != nil {
   768  		t.Fatal(err)
   769  	}
   770  	js = queryJobUntil(t, sqlDB.DB, jobID, func(js jobState) bool { return jobs.StatusSucceeded == js.status })
   771  
   772  	// Verify that after resume we have not processed any additional rows.
   773  	assert.Zero(t, importSummary.Rows)
   774  }
   775  
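         // externalStorageFactory returns a cloud.ExternalStorageFactory that serves
         // the generators registered with this generatedStorage, keyed by URI path.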
   776  func (ses *generatedStorage) externalStorageFactory() cloud.ExternalStorageFactory {
   777  	return func(_ context.Context, es roachpb.ExternalStorage) (cloud.ExternalStorage, error) {
   778  		uri, err := url.Parse(es.HttpPath.BaseUri)
   779  		if err != nil {
   780  			return nil, err
   781  		}
   782  		id, ok := ses.nameIDMap[uri.Path]
   783  		if !ok {
   784  			id = ses.nextID
   785  			ses.nextID++
   786  			ses.nameIDMap[uri.Path] = id
   787  		}
   788  		return &generatorExternalStorage{conf: es, gen: ses.generators[id]}, nil
   789  	}
   790  }
   791  
    792  // An ExternalStorage factory, rooted at the current working directory, needed to run converters.
   793  func externalStorageFactory(
   794  	ctx context.Context, dest roachpb.ExternalStorage,
   795  ) (cloud.ExternalStorage, error) {
   796  	workdir, err := os.Getwd()
   797  	if err != nil {
   798  		return nil, err
   799  	}
   800  	return cloud.MakeExternalStorage(ctx, dest, base.ExternalIODirConfig{},
   801  		nil, blobs.TestBlobServiceClient(workdir))
   802  }
   803  
    804  // newTestSpec creates and initializes a testSpec for the given format and input files.
   805  func newTestSpec(t *testing.T, format roachpb.IOFileFormat, inputs ...string) testSpec {
   806  	spec := testSpec{
   807  		format: format,
   808  		inputs: make(map[int32]string),
   809  	}
   810  
    811  	// Initialize the table descriptor for import. We need a valid descriptor to
    812  	// run converters, even though we don't actually import anything in these tests.
   813  	var descr *sqlbase.TableDescriptor
   814  	switch format.Format {
   815  	case roachpb.IOFileFormat_CSV:
   816  		descr = descForTable(t,
   817  			"CREATE TABLE simple (i INT PRIMARY KEY, s text )", 10, 20, NoFKs)
   818  	case
   819  		roachpb.IOFileFormat_Mysqldump,
   820  		roachpb.IOFileFormat_MysqlOutfile,
   821  		roachpb.IOFileFormat_PgDump,
   822  		roachpb.IOFileFormat_PgCopy,
   823  		roachpb.IOFileFormat_Avro:
   824  		descr = descForTable(t,
   825  			"CREATE TABLE simple (i INT PRIMARY KEY, s text, b bytea default null)", 10, 20, NoFKs)
   826  	default:
   827  		t.Fatalf("Unsupported input format: %v", format)
   828  	}
   829  
   830  	targetCols := make([]string, len(descr.Columns))
   831  	numCols := 0
   832  	for i, col := range descr.Columns {
   833  		if !col.Hidden {
   834  			targetCols[i] = col.Name
   835  			numCols++
   836  		}
   837  	}
   838  	assert.True(t, numCols > 0)
   839  
   840  	spec.tables = map[string]*execinfrapb.ReadImportDataSpec_ImportTable{
   841  		"simple": {Desc: descr, TargetCols: targetCols[0:numCols]},
   842  	}
   843  
   844  	for id, path := range inputs {
   845  		spec.inputs[int32(id)] = cloud.MakeLocalStorageURI(path)
   846  	}
   847  
   848  	return spec
   849  }
   850  
   851  func pgDumpFormat() roachpb.IOFileFormat {
   852  	return roachpb.IOFileFormat{
   853  		Format: roachpb.IOFileFormat_PgDump,
   854  		PgDump: roachpb.PgDumpOptions{
   855  			MaxRowSize: 64 * 1024,
   856  		},
   857  	}
   858  }
   859  
   860  func pgCopyFormat() roachpb.IOFileFormat {
   861  	return roachpb.IOFileFormat{
   862  		Format: roachpb.IOFileFormat_PgCopy,
   863  		PgCopy: roachpb.PgCopyOptions{
   864  			Delimiter:  '\t',
   865  			Null:       `\N`,
   866  			MaxRowSize: 4096,
   867  		},
   868  	}
   869  }
   870  
   871  func mysqlDumpFormat() roachpb.IOFileFormat {
   872  	return roachpb.IOFileFormat{
   873  		Format: roachpb.IOFileFormat_Mysqldump,
   874  	}
   875  }
   876  
   877  func mysqlOutFormat() roachpb.IOFileFormat {
   878  	return roachpb.IOFileFormat{
   879  		Format: roachpb.IOFileFormat_MysqlOutfile,
   880  		MysqlOut: roachpb.MySQLOutfileOptions{
   881  			FieldSeparator: ',',
   882  			RowSeparator:   '\n',
   883  			HasEscape:      true,
   884  			Escape:         '\\',
   885  			Enclose:        roachpb.MySQLOutfileOptions_Always,
   886  			Encloser:       '"',
   887  		},
   888  	}
   889  }
   890  
   891  func csvFormat() roachpb.IOFileFormat {
   892  	return roachpb.IOFileFormat{
   893  		Format: roachpb.IOFileFormat_CSV,
   894  	}
   895  }
   896  
   897  func avroFormat(t *testing.T, format roachpb.AvroOptions_Format) roachpb.IOFileFormat {
   898  	avro := roachpb.AvroOptions{
   899  		Format:     format,
   900  		StrictMode: false,
   901  	}
   902  
   903  	if format != roachpb.AvroOptions_OCF {
    904  		// Need to load the schema for record-specific inputs.
   905  		bytes, err := ioutil.ReadFile("testdata/avro/simple-schema.json")
   906  		require.NoError(t, err)
   907  		avro.SchemaJSON = string(bytes)
   908  		avro.RecordSeparator = '\n'
   909  	}
   910  
   911  	return roachpb.IOFileFormat{
   912  		Format: roachpb.IOFileFormat_Avro,
   913  		Avro:   avro,
   914  	}
   915  }