github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/ccl/importccl/read_import_mysqlout.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Licensed as a CockroachDB Enterprise file under the Cockroach Community
     4  // License (the "License"); you may not use this file except in compliance with
     5  // the License. You may obtain a copy of the License at
     6  //
     7  //     https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt
     8  
     9  package importccl
    10  
    11  import (
    12  	"bufio"
    13  	"context"
    14  	"fmt"
    15  	"io"
    16  	"unicode"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/row"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    22  	"github.com/cockroachdb/cockroach/pkg/storage/cloud"
    23  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    24  	"github.com/cockroachdb/errors"
    25  )
    26  
// mysqloutfileReader is the inputConverter implementation for files in
// MySQL's delimited "outfile" format. It carries the format options and the
// parallel-import state that readFile hands to runParallelImport.
type mysqloutfileReader struct {
	importCtx *parallelImportContext      // shared state passed to runParallelImport for each file
	opts      roachpb.MySQLOutfileOptions // separators, encloser, escape, NULL encoding and skip settings
}

// Compile-time check that *mysqloutfileReader satisfies inputConverter.
var _ inputConverter = &mysqloutfileReader{}
    33  
    34  func newMysqloutfileReader(
    35  	opts roachpb.MySQLOutfileOptions,
    36  	kvCh chan row.KVBatch,
    37  	walltime int64,
    38  	parallelism int,
    39  	tableDesc *sqlbase.TableDescriptor,
    40  	evalCtx *tree.EvalContext,
    41  ) (*mysqloutfileReader, error) {
    42  	return &mysqloutfileReader{
    43  		importCtx: &parallelImportContext{
    44  			walltime:   walltime,
    45  			numWorkers: parallelism,
    46  			evalCtx:    evalCtx,
    47  			tableDesc:  tableDesc,
    48  			kvCh:       kvCh,
    49  		},
    50  		opts: opts,
    51  	}, nil
    52  }
    53  
// start is a no-op: this reader launches no background work of its own.
// NOTE(review): presumably required by the inputConverter interface --
// confirm against its definition. The parameter is a ctxgroup.Group despite
// being named ctx.
func (d *mysqloutfileReader) start(ctx ctxgroup.Group) {
}
    56  
// readFiles delegates to readInputFiles, passing d.readFile as the per-file
// callback; all other arguments are forwarded unchanged and the error from
// readInputFiles is returned as-is.
func (d *mysqloutfileReader) readFiles(
	ctx context.Context,
	dataFiles map[int32]string,
	resumePos map[int32]int64,
	format roachpb.IOFileFormat,
	makeExternalStorage cloud.ExternalStorageFactory,
) error {
	return readInputFiles(ctx, dataFiles, resumePos, format, d.readFile, makeExternalStorage)
}
    66  
// delimitedProducer is the importRowProducer for MySQL outfile data: Scan
// reads one raw row at a time from reader, and Row returns it as a []rune
// for delimitedConsumer.FillDatums to split into fields.
type delimitedProducer struct {
	importCtx *parallelImportContext       // set by readFile; not read by the methods in this file
	opts      *roachpb.MySQLOutfileOptions // format options (row separator, escape, encloser)
	input     *fileReader                  // underlying file; queried for progress reporting
	reader    *bufio.Reader                // buffered reader over input that Scan consumes
	row       []rune                       // runes of the most recently scanned row
	err       error                        // last read error; io.EOF is recorded in eof instead
	eof       bool                         // set once reader has reported io.EOF
}

// Compile-time check that *delimitedProducer satisfies importRowProducer.
var _ importRowProducer = &delimitedProducer{}
    78  
    79  // Scan implements importRowProducer
    80  func (d *delimitedProducer) Scan() bool {
    81  	d.row = nil
    82  	var r rune
    83  	var w int
    84  	nextLiteral := false
    85  	fieldEnclosed := false
    86  
    87  	for {
    88  		r, w, d.err = d.reader.ReadRune()
    89  		if d.err == io.EOF {
    90  			d.eof = true
    91  			d.err = nil
    92  		}
    93  
    94  		if d.eof {
    95  			if d.row != nil {
    96  				return true
    97  			}
    98  			if nextLiteral {
    99  				d.err = io.ErrUnexpectedEOF
   100  			}
   101  			return false
   102  		}
   103  
   104  		if d.err != nil {
   105  			return false
   106  		}
   107  
   108  		if r == unicode.ReplacementChar && w == 1 {
   109  			if d.err = d.reader.UnreadRune(); d.err != nil {
   110  				return false
   111  			}
   112  			var raw byte
   113  			raw, d.err = d.reader.ReadByte()
   114  			if d.err != nil {
   115  				return false
   116  			}
   117  			r = rune(raw)
   118  		}
   119  
   120  		if r == d.opts.RowSeparator && !nextLiteral && !fieldEnclosed {
   121  			return true
   122  		}
   123  
   124  		d.row = append(d.row, r)
   125  
   126  		if d.opts.HasEscape {
   127  			nextLiteral = !nextLiteral && r == d.opts.Escape
   128  		}
   129  
   130  		if d.opts.Enclose != roachpb.MySQLOutfileOptions_Never && r == d.opts.Encloser {
   131  			// We only care about well formed, enclosed fields (i.e. ones that start with
   132  			// enclose rune.  If we see enclose character anywhere else, then we either
   133  			// close the opened enclosing, or we treat this as an invalid enclosing,
   134  			// and let FillDatums below take care of reporting and handling any errors.
   135  			fieldEnclosed = len(d.row) == 1
   136  		}
   137  	}
   138  }
   139  
// Err implements importRowProducer. It returns the error recorded by the most
// recent Scan; end of input is not reported as an error (Scan clears io.EOF).
func (d *delimitedProducer) Err() error {
	return d.err
}
   144  
// Skip implements importRowProducer. It does nothing and always returns nil.
func (d *delimitedProducer) Skip() error {
	return nil // no-op
}
   149  
// Row implements importRowProducer. It returns the row read by the last Scan
// as a []rune (boxed in interface{}), together with the recorded error.
func (d *delimitedProducer) Row() (interface{}, error) {
	return d.row, d.err
}
   154  
// Progress implements importRowProducer by returning the input file's
// ReadFraction.
func (d *delimitedProducer) Progress() float32 {
	return d.input.ReadFraction()
}
   159  
// delimitedConsumer is the importRowConsumer for MySQL outfile data: it
// splits a raw row into fields and converts them to datums (see FillDatums).
type delimitedConsumer struct {
	opts *roachpb.MySQLOutfileOptions // format options (field separator, escape, encloser, NULL encoding)
}

// Compile-time check that *delimitedConsumer satisfies importRowConsumer.
var _ importRowConsumer = &delimitedConsumer{}
   165  
// FillDatums implements importRowConsumer.
//
// It splits one raw row into fields, honoring the configured escape and
// encloser characters, converts each field into a datum of the corresponding
// visible column's type, and stores the result in conv.Datums. input must be
// a []rune (the type assertion panics otherwise). Every failure is returned
// through newImportRowError together with the row text and rowNum.
func (d *delimitedConsumer) FillDatums(
	input interface{}, rowNum int64, conv *row.DatumRowConverter,
) error {
	data := input.([]rune)

	// The current field being read needs to be a list to be able to undo
	// field enclosures at end of field.
	var fieldParts []rune

	// If we have an escaping char defined, seeing it means the next char is to be
	// treated as escaped -- usually that means literal but has some specific
	// mappings defined as well.
	var nextLiteral bool

	// If we have an enclosing char defined, seeing it begins reading a field --
	// which means we do not look for separators until we see the end of the field
	// as indicated by the matching enclosing char.
	var readingField bool

	// If we have just encountered a potential encloser symbol.
	// That means if an end of field or line is next we should honor it.
	var gotEncloser bool

	// Set when the escaped null marker (escape char followed by 'N') has been
	// seen in the current field.
	var gotNull bool

	// Index of the next visible column / datum slot to fill.
	var datumIdx int

	// addField finishes the field accumulated in fieldParts: it validates the
	// escape/encloser state, converts the text to a datum, stores it at
	// conv.Datums[datumIdx] and advances datumIdx.
	addField := func() error {
		// Reset the per-field state no matter how we leave, so the next field
		// starts clean.
		defer func() {
			fieldParts = fieldParts[:0]
			readingField = false
			gotEncloser = false
		}()
		// The field ended directly after an escape character.
		if nextLiteral {
			return newImportRowError(errors.New("unmatched literal"), string(data), rowNum)
		}

		var datum tree.Datum

		// If the previous symbol was a field encloser it should be
		// dropped, as it only marks the end of the field. Otherwise
		// return an error since we don't expect an unmatched encloser.
		if gotEncloser {
			// If the encloser marked end of field
			// drop it.
			if readingField {
				fieldParts = fieldParts[:len(fieldParts)-1]
			} else {
				// Unexpected since we did not see one at start of field.
				gotEncloser = false
				return newImportRowError(errors.New("unmatched field enclosure at end of field"),
					string(data), rowNum)
			}
		} else if readingField {
			// The field opened with an encloser but did not end with one.
			return newImportRowError(errors.New("unmatched field enclosure at start of field"),
				string(data), rowNum)
		}
		field := string(fieldParts)
		if datumIdx >= len(conv.VisibleCols) {
			return newImportRowError(
				fmt.Errorf("too many columns, got %d expected %d", datumIdx+1, len(conv.VisibleCols)),
				string(data), rowNum)
		}

		if gotNull {
			// An escaped null marker must be the field's entire content.
			gotNull = false
			if len(field) != 0 {
				return newImportRowError(fmt.Errorf("unexpected data after null encoding: %q", field),
					string(data), rowNum)
			}
			datum = tree.DNull
		} else if (!d.opts.HasEscape && field == "NULL") || d.opts.NullEncoding != nil && field == *d.opts.NullEncoding {
			// Without an escape character the bare word NULL denotes a null
			// value; a configured NullEncoding matches regardless of escaping.
			datum = tree.DNull
		} else {
			// This uses ParseDatumStringAsWithRawBytes instead of ParseDatumStringAs since mysql emits
			// raw byte strings that do not use the same escaping as our ParseBytes
			// function expects, and the difference between ParseStringAs and
			// ParseDatumStringAs is whether or not it attempts to parse bytes.
			var err error
			datum, err = sqlbase.ParseDatumStringAsWithRawBytes(conv.VisibleColTypes[datumIdx], field, conv.EvalCtx)
			if err != nil {
				col := conv.VisibleCols[datumIdx]
				return newImportRowError(
					fmt.Errorf("error %s while parse %q as %s", err, col.Name, col.Type.SQLString()),
					string(data), rowNum)
			}
		}
		conv.Datums[datumIdx] = datum
		datumIdx++
		return nil
	}

	// Main parsing loop: walk the row one rune at a time, accumulating the
	// current field and emitting it at each field separator. We are being
	// conservative and abort the whole row on the first error.
	for _, c := range data {
		// Do we need to check for escaping?
		if d.opts.HasEscape {
			if nextLiteral {
				// c follows an escape character: translate the escape sequence.
				nextLiteral = false
				// See https://dev.mysql.com/doc/refman/8.0/en/load-data.html.
				switch c {
				case '0':
					fieldParts = append(fieldParts, rune(0))
				case 'b':
					fieldParts = append(fieldParts, rune('\b'))
				case 'n':
					fieldParts = append(fieldParts, rune('\n'))
				case 'r':
					fieldParts = append(fieldParts, rune('\r'))
				case 't':
					fieldParts = append(fieldParts, rune('\t'))
				case 'Z':
					// Code point 26.
					fieldParts = append(fieldParts, rune(byte(26)))
				case 'N':
					// Escaped null marker; only one is allowed per field.
					if gotNull {
						return newImportRowError(errors.New("unexpected null encoding"), string(data), rowNum)
					}
					gotNull = true
				default:
					// Any other escaped rune stands for itself.
					fieldParts = append(fieldParts, c)
				}
				gotEncloser = false
				continue
			}

			if c == d.opts.Escape {
				// Start of an escape sequence; the escape character itself is
				// not part of the field.
				nextLiteral = true
				gotEncloser = false
				continue
			}
		}

		// Are we done with the field, or even the whole row?
		// Inside an enclosed field a separator only counts right after a
		// (potential closing) encloser.
		if (!readingField || gotEncloser) && c == d.opts.FieldSeparator {
			if err := addField(); err != nil {
				return err
			}
			continue
		}

		// The previous encloser was not followed by a separator, so it was
		// ordinary field content rather than a closing encloser.
		if gotEncloser {
			gotEncloser = false
		}

		// If enclosing is not disabled, check for the encloser.
		// Technically when it is not optional, we could _require_ it to start and
		// end fields, but for the purposes of decoding, we don't actually care --
		// we'll handle it if we see it either way.
		if d.opts.Enclose != roachpb.MySQLOutfileOptions_Never && c == d.opts.Encloser {
			if !readingField && len(fieldParts) == 0 {
				// Opening encloser: it is not part of the field's content.
				readingField = true
				continue
			}
			// Possibly the closing encloser; it is appended below and removed
			// again by addField if it turns out to end the field.
			gotEncloser = true
		}
		fieldParts = append(fieldParts, c)
	}

	// The last field of the row has no trailing separator; flush it here.
	if err := addField(); err != nil {
		return err
	}

	// Too many fields are caught inside addField; this catches too few.
	if datumIdx != len(conv.VisibleCols) {
		return newImportRowError(fmt.Errorf(
			"unexpected number of columns, expected %d got %d", len(conv.VisibleCols), datumIdx),
			string(data), rowNum)
	}

	return nil
}
   337  
   338  func (d *mysqloutfileReader) readFile(
   339  	ctx context.Context, input *fileReader, inputIdx int32, resumePos int64, rejected chan string,
   340  ) error {
   341  	producer := &delimitedProducer{
   342  		importCtx: d.importCtx,
   343  		opts:      &d.opts,
   344  		input:     input,
   345  		reader:    bufio.NewReaderSize(input, 64*1024),
   346  	}
   347  	consumer := &delimitedConsumer{opts: &d.opts}
   348  
   349  	if resumePos < int64(d.opts.Skip) {
   350  		resumePos = int64(d.opts.Skip)
   351  	}
   352  
   353  	fileCtx := &importFileContext{
   354  		source:   inputIdx,
   355  		skip:     resumePos,
   356  		rejected: rejected,
   357  	}
   358  
   359  	return runParallelImport(ctx, d.importCtx, fileCtx, producer, consumer)
   360  }