github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/importccl/read_import_csv.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Licensed as a CockroachDB Enterprise file under the Cockroach Community
     4  // License (the "License"); you may not use this file except in compliance with
     5  // the License. You may obtain a copy of the License at
     6  //
     7  //     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt
     8  
     9  package importccl
    10  
    11  import (
    12  	"context"
    13  	"io"
    14  	"strings"
    15  
    16  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    17  	"github.com/cockroachdb/cockroach/pkg/sql/row"
    18  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    20  	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
    21  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    22  	"github.com/cockroachdb/cockroach/pkg/util/encoding/csv"
    23  	"github.com/cockroachdb/errors"
    24  )
    25  
// csvInputReader is an inputConverter that reads CSV data files and feeds
// their rows through the parallel import pipeline (see runParallelImport).
type csvInputReader struct {
	importCtx *parallelImportContext // shared configuration for the parallel conversion workers
	opts      roachpb.CSVOptions     // CSV-specific options (delimiter, comment char, skip count, ...)
}

var _ inputConverter = &csvInputReader{}
    32  
    33  func newCSVInputReader(
    34  	kvCh chan row.KVBatch,
    35  	opts roachpb.CSVOptions,
    36  	walltime int64,
    37  	parallelism int,
    38  	tableDesc *sqlbase.TableDescriptor,
    39  	targetCols tree.NameList,
    40  	evalCtx *tree.EvalContext,
    41  ) *csvInputReader {
    42  	return &csvInputReader{
    43  		importCtx: &parallelImportContext{
    44  			walltime:   walltime,
    45  			numWorkers: parallelism,
    46  			evalCtx:    evalCtx,
    47  			tableDesc:  tableDesc,
    48  			targetCols: targetCols,
    49  			kvCh:       kvCh,
    50  		},
    51  		opts: opts,
    52  	}
    53  }
    54  
// start implements the inputConverter interface. The CSV reader performs all
// of its work inside readFiles, so there is nothing to launch here.
func (c *csvInputReader) start(group ctxgroup.Group) {
}
    57  
// readFiles implements the inputConverter interface: it iterates over
// dataFiles, delegating per-file parsing to c.readFile. resumePos carries the
// per-file row offset at which a resumed IMPORT should continue.
func (c *csvInputReader) readFiles(
	ctx context.Context,
	dataFiles map[int32]string,
	resumePos map[int32]int64,
	format roachpb.IOFileFormat,
	makeExternalStorage cloud.ExternalStorageFactory,
) error {
	return readInputFiles(ctx, dataFiles, resumePos, format, c.readFile, makeExternalStorage)
}
    67  
    68  func (c *csvInputReader) readFile(
    69  	ctx context.Context, input *fileReader, inputIdx int32, resumePos int64, rejected chan string,
    70  ) error {
    71  	producer, consumer := newCSVPipeline(c, input)
    72  
    73  	if resumePos < int64(c.opts.Skip) {
    74  		resumePos = int64(c.opts.Skip)
    75  	}
    76  
    77  	fileCtx := &importFileContext{
    78  		source:   inputIdx,
    79  		skip:     resumePos,
    80  		rejected: rejected,
    81  	}
    82  
    83  	return runParallelImport(ctx, c.importCtx, fileCtx, producer, consumer)
    84  }
    85  
// csvRowProducer is the importRowProducer side of the CSV pipeline: it pulls
// raw records off a csv.Reader and validates their field counts.
type csvRowProducer struct {
	importCtx       *parallelImportContext
	opts            *roachpb.CSVOptions
	csv             *csv.Reader
	rowNum          int64          // 1-based count of rows returned by Row()
	err             error          // last error from csv.Read, surfaced via Err()
	record          []string       // most recently scanned record
	progress        func() float32 // reports fraction of the input consumed
	expectedColumns tree.NameList  // target columns; empty means all visible table columns
}

var _ importRowProducer = &csvRowProducer{}
    98  
    99  // Scan() implements importRowProducer interface.
   100  func (p *csvRowProducer) Scan() bool {
   101  	p.record, p.err = p.csv.Read()
   102  
   103  	if p.err == io.EOF {
   104  		p.err = nil
   105  		return false
   106  	}
   107  
   108  	return p.err == nil
   109  }
   110  
// Err() implements importRowProducer interface. It reports the error, if
// any, recorded by the most recent call to Scan.
func (p *csvRowProducer) Err() error {
	return p.err
}
   115  
// Skip() implements importRowProducer interface. CSV rows require no
// per-row cleanup when skipped, so this is a no-op.
func (p *csvRowProducer) Skip() error {
	// No-op
	return nil
}
   121  
   122  func strRecord(record []string, sep rune) string {
   123  	csvSep := ","
   124  	if sep != 0 {
   125  		csvSep = string(sep)
   126  	}
   127  	return strings.Join(record, csvSep)
   128  }
   129  
   130  // Row() implements importRowProducer interface.
   131  func (p *csvRowProducer) Row() (interface{}, error) {
   132  	p.rowNum++
   133  	expectedColsLen := len(p.expectedColumns)
   134  	if expectedColsLen == 0 {
   135  		expectedColsLen = len(p.importCtx.tableDesc.VisibleColumns())
   136  	}
   137  
   138  	if len(p.record) == expectedColsLen {
   139  		// Expected number of columns.
   140  	} else if len(p.record) == expectedColsLen+1 && p.record[expectedColsLen] == "" {
   141  		// Line has the optional trailing comma, ignore the empty field.
   142  		p.record = p.record[:expectedColsLen]
   143  	} else {
   144  		return nil, newImportRowError(
   145  			errors.Errorf("expected %d fields, got %d", expectedColsLen, len(p.record)),
   146  			strRecord(p.record, p.opts.Comma),
   147  			p.rowNum)
   148  
   149  	}
   150  	return p.record, nil
   151  }
   152  
// Progress() implements importRowProducer interface. It delegates to the
// progress callback installed by newCSVPipeline, which reports the fraction
// of the input file consumed so far.
func (p *csvRowProducer) Progress() float32 {
	return p.progress()
}
   157  
// csvRowConsumer is the importRowConsumer side of the CSV pipeline: it
// converts the string fields produced by csvRowProducer into datums.
type csvRowConsumer struct {
	importCtx *parallelImportContext
	opts      *roachpb.CSVOptions // consulted for NullEncoding and the Comma separator
}

var _ importRowConsumer = &csvRowConsumer{}
   164  
   165  // FillDatums() implements importRowConsumer interface
   166  func (c *csvRowConsumer) FillDatums(
   167  	row interface{}, rowNum int64, conv *row.DatumRowConverter,
   168  ) error {
   169  	record := row.([]string)
   170  	datumIdx := 0
   171  
   172  	for i, field := range record {
   173  		// Skip over record entries corresponding to columns not in the target
   174  		// columns specified by the user.
   175  		if _, ok := conv.IsTargetCol[i]; !ok {
   176  			continue
   177  		}
   178  
   179  		if c.opts.NullEncoding != nil &&
   180  			field == *c.opts.NullEncoding {
   181  			conv.Datums[datumIdx] = tree.DNull
   182  		} else {
   183  			var err error
   184  			conv.Datums[datumIdx], err = sqlbase.ParseDatumStringAs(conv.VisibleColTypes[i], field, conv.EvalCtx)
   185  			if err != nil {
   186  				col := conv.VisibleCols[i]
   187  				return newImportRowError(
   188  					errors.Wrapf(err, "parse %q as %s", col.Name, col.Type.SQLString()),
   189  					strRecord(record, c.opts.Comma),
   190  					rowNum)
   191  			}
   192  		}
   193  		datumIdx++
   194  	}
   195  	return nil
   196  }
   197  
   198  func newCSVPipeline(c *csvInputReader, input *fileReader) (*csvRowProducer, *csvRowConsumer) {
   199  	cr := csv.NewReader(input)
   200  	if c.opts.Comma != 0 {
   201  		cr.Comma = c.opts.Comma
   202  	}
   203  	cr.FieldsPerRecord = -1
   204  	cr.LazyQuotes = !c.opts.StrictQuotes
   205  	cr.Comment = c.opts.Comment
   206  
   207  	producer := &csvRowProducer{
   208  		importCtx:       c.importCtx,
   209  		opts:            &c.opts,
   210  		csv:             cr,
   211  		progress:        func() float32 { return input.ReadFraction() },
   212  		expectedColumns: c.importCtx.targetCols,
   213  	}
   214  	consumer := &csvRowConsumer{
   215  		importCtx: c.importCtx,
   216  		opts:      &c.opts,
   217  	}
   218  
   219  	return producer, consumer
   220  }