github.com/pingcap/br@v5.3.0-alpha.0.20220125034240-ec59c7b6ce30+incompatible/pkg/lightning/mydump/csv_parser.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package mydump
    15  
    16  import (
    17  	"bytes"
    18  	"io"
    19  	"strings"
    20  
    21  	"github.com/pingcap/errors"
    22  	"github.com/pingcap/tidb/types"
    23  
    24  	"github.com/pingcap/br/pkg/lightning/config"
    25  	"github.com/pingcap/br/pkg/lightning/worker"
    26  	"github.com/pingcap/br/pkg/utils"
    27  )
    28  
// Syntax errors reported while parsing CSV input. They are created without
// stack traces because they describe expected user-input problems; a stack is
// attached at the reporting site via errors.AddStack.
var (
	errUnterminatedQuotedField = errors.NewNoStackError("syntax error: unterminated quoted field")
	errDanglingBackslash       = errors.NewNoStackError("syntax error: no character after backslash")
	errUnexpectedQuoteField    = errors.NewNoStackError("syntax error: cannot have consecutive fields without separator")
)
    34  
// CSVParser is basically a copy of encoding/csv, but special-cased for MySQL-like input.
type CSVParser struct {
	blockParser
	cfg *config.CSVConfig

	// comma, quote and newLine are the (possibly multi-byte) separator,
	// delimiter and terminator, copied from cfg at construction time.
	comma   []byte
	quote   []byte
	newLine []byte

	// These variables are used with IndexAnyByte to search a byte slice for the
	// first index which some special character may appear.
	// quoteByteSet is used inside quoted fields (so the first characters of
	// the closing delimiter and backslash are special).
	// unquoteByteSet is used outside quoted fields (so the first characters
	// of the opening delimiter, separator, terminator and backslash are
	// special).
	// newLineByteSet is used in strict-format CSV dividing (so the first
	// characters of the terminator are special).
	quoteByteSet   byteSet
	unquoteByteSet byteSet
	newLineByteSet byteSet

	// recordBuffer holds the unescaped fields, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
	// and fieldIndexes will contain the indexes [1, 2, 5, 6].
	recordBuffer []byte

	// fieldIndexes is an index of fields inside recordBuffer.
	// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
	fieldIndexes []int

	// lastRecord caches the most recently returned field slice so that
	// readRecord can reuse its backing array for the next row.
	lastRecord []string

	// escFlavor selects how backslash escapes (and the `\N` NULL marker)
	// are interpreted.
	escFlavor backslashEscapeFlavor
	// if set to true, csv parser will treat the first non-empty line as header line
	shouldParseHeader bool
}
    73  
    74  func NewCSVParser(
    75  	cfg *config.CSVConfig,
    76  	reader ReadSeekCloser,
    77  	blockBufSize int64,
    78  	ioWorkers *worker.Pool,
    79  	shouldParseHeader bool,
    80  ) *CSVParser {
    81  	escFlavor := backslashEscapeFlavorNone
    82  	var quoteStopSet, newLineStopSet []byte
    83  	unquoteStopSet := []byte{cfg.Separator[0]}
    84  	if len(cfg.Delimiter) > 0 {
    85  		quoteStopSet = []byte{cfg.Delimiter[0]}
    86  		unquoteStopSet = append(unquoteStopSet, cfg.Delimiter[0])
    87  	}
    88  	if len(cfg.Terminator) > 0 {
    89  		newLineStopSet = []byte{cfg.Terminator[0]}
    90  	} else {
    91  		newLineStopSet = []byte{'\r', '\n'}
    92  	}
    93  	unquoteStopSet = append(unquoteStopSet, newLineStopSet...)
    94  	if cfg.BackslashEscape {
    95  		escFlavor = backslashEscapeFlavorMySQL
    96  		quoteStopSet = append(quoteStopSet, '\\')
    97  		unquoteStopSet = append(unquoteStopSet, '\\')
    98  		// we need special treatment of the NULL value \N, used by MySQL.
    99  		if !cfg.NotNull && cfg.Null == `\N` {
   100  			escFlavor = backslashEscapeFlavorMySQLWithNull
   101  		}
   102  	}
   103  
   104  	return &CSVParser{
   105  		blockParser:       makeBlockParser(reader, blockBufSize, ioWorkers),
   106  		cfg:               cfg,
   107  		comma:             []byte(cfg.Separator),
   108  		quote:             []byte(cfg.Delimiter),
   109  		newLine:           []byte(cfg.Terminator),
   110  		escFlavor:         escFlavor,
   111  		quoteByteSet:      makeByteSet(quoteStopSet),
   112  		unquoteByteSet:    makeByteSet(unquoteStopSet),
   113  		newLineByteSet:    makeByteSet(newLineStopSet),
   114  		shouldParseHeader: shouldParseHeader,
   115  	}
   116  }
   117  
   118  func (parser *CSVParser) unescapeString(input string) (unescaped string, isNull bool) {
   119  	if parser.escFlavor == backslashEscapeFlavorMySQLWithNull && input == `\N` {
   120  		return input, true
   121  	}
   122  	unescaped = unescape(input, "", parser.escFlavor)
   123  	isNull = parser.escFlavor != backslashEscapeFlavorMySQLWithNull &&
   124  		!parser.cfg.NotNull &&
   125  		unescaped == parser.cfg.Null
   126  	return
   127  }
   128  
// csvToken is a type representing either a normal byte or some CSV-specific
// tokens such as the separator (comma), delimiter (quote) and terminator (new
// line). Values below 0x100 are plain bytes; the flag bits below are all at
// or above 0x100, so a flagged token can never collide with a literal byte.
type csvToken int16

const (
	// csvTokenAnyUnquoted is a placeholder to represent any unquoted character.
	csvTokenAnyUnquoted csvToken = 0
	// csvTokenWithBackslash is a mask indicating an escaped character.
	// The actual token is represented like `csvTokenWithBackslash | 'n'`.
	csvTokenWithBackslash csvToken = 0x100
	// csvTokenComma is the CSV separator token.
	csvTokenComma csvToken = 0x200
	// csvTokenNewLine is the CSV terminator token.
	csvTokenNewLine csvToken = 0x400
	// csvTokenDelimiter is the CSV delimiter token.
	csvTokenDelimiter csvToken = 0x800
)
   147  
   148  func (parser *CSVParser) readByte() (byte, error) {
   149  	if len(parser.buf) == 0 {
   150  		if err := parser.readBlock(); err != nil {
   151  			return 0, err
   152  		}
   153  	}
   154  	if len(parser.buf) == 0 {
   155  		return 0, io.EOF
   156  	}
   157  	b := parser.buf[0]
   158  	parser.buf = parser.buf[1:]
   159  	parser.pos++
   160  	return b, nil
   161  }
   162  
   163  func (parser *CSVParser) peekBytes(cnt int) ([]byte, error) {
   164  	if len(parser.buf) < cnt {
   165  		if err := parser.readBlock(); err != nil {
   166  			return nil, err
   167  		}
   168  	}
   169  	if len(parser.buf) == 0 {
   170  		return nil, io.EOF
   171  	}
   172  	cnt = utils.MinInt(cnt, len(parser.buf))
   173  	return parser.buf[:cnt], nil
   174  }
   175  
   176  func (parser *CSVParser) skipBytes(n int) {
   177  	parser.buf = parser.buf[n:]
   178  	parser.pos += int64(n)
   179  }
   180  
   181  // tryReadExact peeks the bytes ahead, and if it matches `content` exactly will
   182  // consume it (advance the cursor) and return `true`.
   183  func (parser *CSVParser) tryReadExact(content []byte) (bool, error) {
   184  	if len(content) == 0 {
   185  		return true, nil
   186  	}
   187  	bs, err := parser.peekBytes(len(content))
   188  	if err == nil {
   189  		if bytes.Equal(bs, content) {
   190  			parser.skipBytes(len(content))
   191  			return true, nil
   192  		}
   193  	} else if errors.Cause(err) == io.EOF {
   194  		err = nil
   195  	}
   196  	return false, err
   197  }
   198  
   199  func (parser *CSVParser) tryReadNewLine(b byte) (bool, error) {
   200  	if len(parser.newLine) == 0 {
   201  		return b == '\r' || b == '\n', nil
   202  	}
   203  	if b != parser.newLine[0] {
   204  		return false, nil
   205  	}
   206  	return parser.tryReadExact(parser.newLine[1:])
   207  }
   208  
   209  func (parser *CSVParser) tryReadOpenDelimiter(b byte) (bool, error) {
   210  	if len(parser.quote) == 0 || parser.quote[0] != b {
   211  		return false, nil
   212  	}
   213  	return parser.tryReadExact(parser.quote[1:])
   214  }
   215  
   216  // tryReadCloseDelimiter is currently equivalent to tryReadOpenDelimiter until
   217  // we support asymmetric delimiters.
   218  func (parser *CSVParser) tryReadCloseDelimiter(b byte) (bool, error) {
   219  	if parser.quote[0] != b {
   220  		return false, nil
   221  	}
   222  	return parser.tryReadExact(parser.quote[1:])
   223  }
   224  
   225  func (parser *CSVParser) tryReadComma(b byte) (bool, error) {
   226  	if parser.comma[0] != b {
   227  		return false, nil
   228  	}
   229  	return parser.tryReadExact(parser.comma[1:])
   230  }
   231  
   232  func (parser *CSVParser) tryReadBackslashed(bs byte) (bool, byte, error) {
   233  	if bs != '\\' || parser.escFlavor == backslashEscapeFlavorNone {
   234  		return false, 0, nil
   235  	}
   236  	b, err := parser.readByte()
   237  	return true, b, parser.replaceEOF(err, errDanglingBackslash)
   238  }
   239  
   240  // readQuoteToken reads a token inside quoted fields.
   241  func (parser *CSVParser) readQuotedToken(b byte) (csvToken, error) {
   242  	if ok, err := parser.tryReadCloseDelimiter(b); ok || err != nil {
   243  		return csvTokenDelimiter, err
   244  	}
   245  	if ok, eb, err := parser.tryReadBackslashed(b); ok || err != nil {
   246  		return csvTokenWithBackslash | csvToken(eb), err
   247  	}
   248  	return csvToken(b), nil
   249  }
   250  
   251  // readUnquoteToken reads a token outside quoted fields.
   252  func (parser *CSVParser) readUnquoteToken(b byte) (csvToken, error) {
   253  	if ok, err := parser.tryReadNewLine(b); ok || err != nil {
   254  		return csvTokenNewLine, err
   255  	}
   256  	if ok, err := parser.tryReadComma(b); ok || err != nil {
   257  		return csvTokenComma, err
   258  	}
   259  	if ok, err := parser.tryReadOpenDelimiter(b); ok || err != nil {
   260  		return csvTokenDelimiter, err
   261  	}
   262  	if ok, eb, err := parser.tryReadBackslashed(b); ok || err != nil {
   263  		return csvTokenWithBackslash | csvToken(eb), err
   264  	}
   265  	return csvToken(b), nil
   266  }
   267  
   268  func (parser *CSVParser) appendCSVTokenToRecordBuffer(token csvToken) {
   269  	if token&csvTokenWithBackslash != 0 {
   270  		parser.recordBuffer = append(parser.recordBuffer, '\\')
   271  	}
   272  	parser.recordBuffer = append(parser.recordBuffer, byte(token))
   273  }
   274  
// readUntil reads the buffer until any character from the `chars` set is found.
// that character is excluded from the final buffer (it stays unconsumed at
// the front of parser.buf and is also returned as the second result).
// On EOF the bytes accumulated so far are returned along with the error, and
// the returned stop byte is meaningless (0).
func (parser *CSVParser) readUntil(chars *byteSet) ([]byte, byte, error) {
	// Fast path: the stop byte is already inside the current buffer, so a
	// sub-slice of parser.buf can be returned without copying.
	index := IndexAnyByte(parser.buf, chars)
	if index >= 0 {
		ret := parser.buf[:index]
		parser.buf = parser.buf[index:]
		parser.pos += int64(index)
		return ret, parser.buf[0], nil
	}

	// not found in parser.buf, need allocate and loop.
	var buf []byte
	for {
		buf = append(buf, parser.buf...)
		parser.buf = nil
		if err := parser.readBlock(); err != nil || len(parser.buf) == 0 {
			if err == nil {
				err = io.EOF
			}
			parser.pos += int64(len(buf))
			return buf, 0, errors.Trace(err)
		}
		index := IndexAnyByte(parser.buf, chars)
		if index >= 0 {
			// buf holds every byte consumed since entry (pos was not
			// advanced inside the loop), so one len(buf) bump accounts for
			// all previous iterations at once.
			buf = append(buf, parser.buf[:index]...)
			parser.buf = parser.buf[index:]
			parser.pos += int64(len(buf))
			return buf, parser.buf[0], nil
		}
	}
}
   307  
// readRecord reads one logical CSV record (one row) and returns its fields,
// reusing dst's backing array when possible. Empty lines and whitespace-only
// lines are skipped. EOF after some content is treated as a trailing
// terminator, completing the final record.
func (parser *CSVParser) readRecord(dst []string) ([]string, error) {
	parser.recordBuffer = parser.recordBuffer[:0]
	parser.fieldIndexes = parser.fieldIndexes[:0]

	isEmptyLine := true
	whitespaceLine := true
	// prevToken tracks the previous structural token so illegal sequences
	// (e.g. a quote immediately following unquoted text) can be rejected.
	prevToken := csvTokenNewLine
	var firstToken csvToken

outside:
	for {
		// Bulk-copy plain content up to the next special byte in one scan.
		content, firstByte, err := parser.readUntil(&parser.unquoteByteSet)

		if len(content) > 0 {
			isEmptyLine = false
			// Unquoted content directly after a closing quote (`"a"b`) is
			// a syntax error.
			if prevToken == csvTokenDelimiter {
				parser.logSyntaxError()
				return nil, errors.AddStack(errUnexpectedQuoteField)
			}
			parser.recordBuffer = append(parser.recordBuffer, content...)
			prevToken = csvTokenAnyUnquoted
		}

		if err != nil {
			if isEmptyLine || errors.Cause(err) != io.EOF {
				return nil, err
			}
			// treat EOF as the same as trailing \n.
			firstToken = csvTokenNewLine
		} else {
			parser.skipBytes(1)
			firstToken, err = parser.readUnquoteToken(firstByte)
			if err != nil {
				return nil, err
			}
		}

		switch firstToken {
		case csvTokenComma:
			whitespaceLine = false
			parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer))
		case csvTokenDelimiter:
			// A quote may only open a field right after a separator or at
			// the start of a line.
			if prevToken != csvTokenComma && prevToken != csvTokenNewLine {
				parser.logSyntaxError()
				return nil, errors.AddStack(errUnexpectedQuoteField)
			}
			if err = parser.readQuotedField(); err != nil {
				return nil, err
			}
			whitespaceLine = false
		case csvTokenNewLine:
			// new line = end of record (ignore empty lines)
			prevToken = firstToken
			if isEmptyLine {
				continue
			}
			// skip lines only contain whitespaces
			if err == nil && whitespaceLine && len(bytes.TrimSpace(parser.recordBuffer)) == 0 {
				parser.recordBuffer = parser.recordBuffer[:0]
				continue
			}
			parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer))
			break outside
		default:
			if prevToken == csvTokenDelimiter {
				parser.logSyntaxError()
				return nil, errors.AddStack(errUnexpectedQuoteField)
			}
			parser.appendCSVTokenToRecordBuffer(firstToken)
		}
		prevToken = firstToken
		isEmptyLine = false
	}
	// Create a single string and create slices out of it.
	// This pins the memory of the fields together, but allocates once.
	str := string(parser.recordBuffer) // Convert to string once to batch allocations
	dst = dst[:0]
	if cap(dst) < len(parser.fieldIndexes) {
		dst = make([]string, len(parser.fieldIndexes))
	}
	dst = dst[:len(parser.fieldIndexes)]
	var preIdx int
	for i, idx := range parser.fieldIndexes {
		dst[i] = str[preIdx:idx]
		preIdx = idx
	}

	// NOTE(review): unlike encoding/csv there is no fields-per-record count
	// check here — presumably column-count validation happens at a higher
	// layer; confirm at callers.
	return dst, nil
}
   398  
// readQuotedField consumes the remainder of a quoted field (the opening
// delimiter has already been consumed by the caller) and appends its content
// to recordBuffer. A doubled delimiter (`""`) is unescaped to a single one.
// EOF before the closing delimiter yields errUnterminatedQuotedField.
func (parser *CSVParser) readQuotedField() error {
	for {
		content, terminator, err := parser.readUntil(&parser.quoteByteSet)
		err = parser.replaceEOF(err, errUnterminatedQuotedField)
		if err != nil {
			return err
		}
		parser.recordBuffer = append(parser.recordBuffer, content...)
		// Consume the stop byte that readUntil left at the front of the buffer.
		parser.skipBytes(1)

		token, err := parser.readQuotedToken(terminator)
		if err != nil {
			return err
		}

		switch token {
		case csvTokenDelimiter:
			// encountered '"' -> continue if we're seeing '""'.
			doubledDelimiter, err := parser.tryReadExact(parser.quote)
			if err != nil {
				return err
			}
			if doubledDelimiter {
				// consume the double quotation mark and continue
				parser.recordBuffer = append(parser.recordBuffer, parser.quote...)
			} else {
				// the field is completed, exit.
				return nil
			}
		default:
			parser.appendCSVTokenToRecordBuffer(token)
		}
	}
}
   433  
   434  func (parser *CSVParser) replaceEOF(err error, replaced error) error {
   435  	if err == nil || errors.Cause(err) != io.EOF {
   436  		return err
   437  	}
   438  	if replaced != nil {
   439  		parser.logSyntaxError()
   440  		replaced = errors.AddStack(replaced)
   441  	}
   442  	return replaced
   443  }
   444  
// ReadRow reads a row from the datafile.
// The result is stored in parser.lastRow: each field is unescaped, and a
// field matching the configured NULL representation becomes a NULL datum.
// When shouldParseHeader is set, the header line is consumed first.
func (parser *CSVParser) ReadRow() error {
	row := &parser.lastRow
	row.Length = 0
	row.RowID++

	// skip the header first
	if parser.shouldParseHeader {
		err := parser.ReadColumns()
		if err != nil {
			return errors.Trace(err)
		}
		parser.shouldParseHeader = false
	}

	records, err := parser.readRecord(parser.lastRecord)
	if err != nil {
		return errors.Trace(err)
	}
	// Keep the (possibly re-allocated) slice so its backing array is reused
	// on the next call.
	parser.lastRecord = records
	// remove the last empty value
	if parser.cfg.TrimLastSep {
		i := len(records) - 1
		if i >= 0 && len(records[i]) == 0 {
			records = records[:i]
		}
	}

	row.Row = parser.acquireDatumSlice()
	if cap(row.Row) >= len(records) {
		row.Row = row.Row[:len(records)]
	} else {
		row.Row = make([]types.Datum, len(records))
	}
	for i, record := range records {
		// Length sums the raw (pre-unescape) field byte lengths.
		row.Length += len(record)
		unescaped, isNull := parser.unescapeString(record)
		if isNull {
			row.Row[i].SetNull()
		} else {
			row.Row[i].SetString(unescaped, "utf8mb4_bin")
		}
	}

	return nil
}
   491  
   492  func (parser *CSVParser) ReadColumns() error {
   493  	columns, err := parser.readRecord(nil)
   494  	if err != nil {
   495  		return errors.Trace(err)
   496  	}
   497  	parser.columns = make([]string, 0, len(columns))
   498  	for _, colName := range columns {
   499  		colName, _ = parser.unescapeString(colName)
   500  		parser.columns = append(parser.columns, strings.ToLower(colName))
   501  	}
   502  	return nil
   503  }
   504  
// newLineASCIISet matches the ASCII new-line characters CR ('\r') and LF ('\n').
var newLineASCIISet = makeByteSet([]byte{'\r', '\n'})

// indexOfNewLine returns the index of the first CR or LF in b, or -1 if
// neither is present.
func indexOfNewLine(b []byte) int {
	return IndexAnyByte(b, &newLineASCIISet)
}
   510  
   511  // ReadUntilTerminator seeks the file until the terminator token is found, and
   512  // returns the file offset beyond the terminator.
   513  // This function is used in strict-format dividing a CSV file.
   514  func (parser *CSVParser) ReadUntilTerminator() (int64, error) {
   515  	for {
   516  		_, firstByte, err := parser.readUntil(&parser.newLineByteSet)
   517  		if err != nil {
   518  			return 0, err
   519  		}
   520  		parser.skipBytes(1)
   521  		if ok, err := parser.tryReadNewLine(firstByte); ok || err != nil {
   522  			return parser.pos, err
   523  		}
   524  	}
   525  }