github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/util/csvparser/csv_parser.go

github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/util/csvparser/csv_parser.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package csvparser
    16  
    17  import (
    18  	"bytes"
    19  	"errors"
    20  	"fmt"
    21  	"io"
    22  	"regexp"
    23  	"slices"
    24  	"strings"
    25  	"unicode/utf8"
    26  
    27  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    28  	"github.com/spkg/bom"
    29  )
    30  
    31  var (
    32  	errUnterminatedQuotedField       = moerr.NewInvalidInputNoCtx("csvParser error: unterminated quoted field")
    33  	errDanglingBackslash             = moerr.NewInvalidInputNoCtx("csvParser error: no character after backslash")
    34  	errUnexpectedQuoteField          = moerr.NewInvalidInputNoCtx("csvParser error: cannot have consecutive fields without separator")
    35  	BufferSizeScale                  = int64(5)
    36  	ReadBlockSize              int64 = 64 * 1024
    37  	// LargestEntryLimit is the max size for reading file to buf
    38  	LargestEntryLimit = 10 * 1024 * 1024
    39  )
    40  
    41  type Field struct {
    42  	Val    string
    43  	IsNull bool
    44  }
    45  
    46  type escapeFlavor uint8
    47  
    48  const (
    49  	escapeFlavorNone escapeFlavor = iota
    50  	escapeFlavorMySQL
    51  	escapeFlavorMySQLWithNull
    52  )
    53  
    54  type CSVConfig struct {
    55  	// they can only be used by LOAD DATA
    56  	// https://dev.mysql.com/doc/refman/8.0/en/load-data.html#load-data-field-line-handling
    57  	LinesStartingBy   string
    58  	LinesTerminatedBy string
    59  
    60  	FieldsTerminatedBy string
    61  	FieldsEnclosedBy   string
    62  	FieldsEscapedBy    string
    63  
    64  	Null              []string
    65  	Header            bool
    66  	HeaderSchemaMatch bool
    67  	TrimLastSep       bool
    68  	NotNull           bool
    69  
    70  	AllowEmptyLine bool
    71  	// For non-empty FieldsEnclosedBy (for example quotes), null elements inside quotes are not considered as null except for
    72  	// `\N` (when escape-by is `\`). That is to say, `\N` is special for null because it always means null.
    73  	QuotedNullIsText bool
    74  	// ref https://dev.mysql.com/doc/refman/8.0/en/load-data.html
    75  	// > If the field begins with the ENCLOSED BY character, instances of that character are recognized as terminating a
    76  	// > field value only if followed by the field or line TERMINATED BY sequence.
    77  	// This means we will meet unescaped quote in a quoted field
    78  	// > The "BIG" boss      -> The "BIG" boss
    79  	// This means we will meet unescaped quote in an unquoted field
    80  	UnescapedQuote bool
    81  
    82  	// see csv.Reader
    83  	Comment byte
    84  }
    85  
    86  // CSVParser is basically a copy of encoding/csv, but special-cased for MySQL-like input.
    87  type CSVParser struct {
    88  	cfg *CSVConfig
    89  
    90  	comma          []byte
    91  	quote          []byte
    92  	newLine        []byte
    93  	startingBy     []byte
    94  	escapedBy      string
    95  	unescapeRegexp *regexp.Regexp
    96  
    97  	// These variables are used with IndexAnyByte to search a byte slice for the
    98  	// first index which some special character may appear.
    99  	// quoteByteSet is used inside quoted fields (so the first characters of
   100  	// the closing delimiter and backslash are special).
   101  	// unquoteByteSet is used outside quoted fields (so the first characters
   102  	// of the opening delimiter, separator, terminator and backslash are
   103  	// special).
   104  	// newLineByteSet is used in strict-format CSV dividing (so the first
   105  	// characters of the terminator are special).
   106  	quoteByteSet   byteSet
   107  	unquoteByteSet byteSet
   108  	newLineByteSet byteSet
   109  
   110  	// recordBuffer holds the unescaped fields, one after another.
   111  	// The fields can be accessed by using the indexes in fieldIndexes.
   112  	// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
   113  	// and fieldIndexes will contain the indexes [1, 2, 5, 6].
   114  	recordBuffer []byte
   115  
   116  	// fieldIndexes is an index of fields inside recordBuffer.
   117  	// The width field ends at offset fieldIndexes[i] in recordBuffer.
   118  	fieldIndexes  []int
   119  	fieldIsQuoted []bool
   120  
   121  	lastRecord []field
   122  
   123  	escFlavor escapeFlavor
   124  	// if set to true, csv parser will treat the first non-empty line as header line
   125  	shouldParseHeader bool
   126  	// in LOAD DATA, empty line should be treated as a valid field
   127  	allowEmptyLine   bool
   128  	quotedNullIsText bool
   129  	unescapedQuote   bool
   130  
   131  	reader io.Reader
   132  	// stores data that has NOT been parsed yet, it shares same memory as appendBuf.
   133  	buf []byte
   134  	// used to read data from the reader, the data will be moved to other buffers.
   135  	blockBuf    []byte
   136  	isLastChunk bool
   137  
   138  	// The list of column names of the last INSERT statement.
   139  	columns []string
   140  
   141  	lastRow []Field
   142  
   143  	// the reader position we have parsed, if the underlying reader is not
   144  	// a compressed file, it's the file position we have parsed too.
   145  	// this value may go backward when failed to read quoted field, but it's
   146  	// for printing error message, and the parser should not be used later,
   147  	// so it's ok, see readQuotedField.
   148  	pos int64
   149  
   150  	// cache
   151  	remainBuf *bytes.Buffer
   152  	appendBuf *bytes.Buffer
   153  
   154  	reuseRow bool
   155  
   156  	// see csv.Reader
   157  	comment byte
   158  }
   159  
   160  type field struct {
   161  	content string
   162  	quoted  bool
   163  }
   164  
   165  // NewCSVParser creates a CSV parser.
   166  func NewCSVParser(
   167  	cfg *CSVConfig,
   168  	reader io.Reader,
   169  	blockBufSize int64,
   170  	shouldParseHeader bool,
   171  	reuseRow bool,
   172  ) (*CSVParser, error) {
   173  	// see csv.Reader
   174  	if !validDelim(rune(cfg.FieldsTerminatedBy[0])) || (cfg.Comment != 0 && !validDelim(rune(cfg.Comment))) || cfg.Comment == cfg.FieldsTerminatedBy[0] {
   175  		return nil, moerr.NewInvalidInputNoCtx("invalid field or comment delimiter")
   176  	}
   177  
   178  	var err error
   179  	var separator, delimiter, terminator string
   180  
   181  	separator = cfg.FieldsTerminatedBy
   182  	delimiter = cfg.FieldsEnclosedBy
   183  	terminator = cfg.LinesTerminatedBy
   184  
   185  	if terminator == "\r\n" {
   186  		terminator = "\n"
   187  	}
   188  
   189  	var quoteStopSet, newLineStopSet []byte
   190  	unquoteStopSet := []byte{separator[0]}
   191  	if len(delimiter) > 0 {
   192  		quoteStopSet = []byte{delimiter[0]}
   193  		unquoteStopSet = append(unquoteStopSet, delimiter[0])
   194  	}
   195  	if len(terminator) > 0 {
   196  		newLineStopSet = []byte{terminator[0]}
   197  	} else {
   198  		// The character set encoding of '\r' and '\n' is the same in UTF-8 and GBK.
   199  		newLineStopSet = []byte{'\r', '\n'}
   200  	}
   201  	unquoteStopSet = append(unquoteStopSet, newLineStopSet...)
   202  
   203  	if len(cfg.LinesStartingBy) > 0 {
   204  		if strings.Contains(cfg.LinesStartingBy, terminator) {
   205  			return nil, moerr.NewInvalidInputNoCtx(fmt.Sprintf("STARTING BY '%s' cannot contain LINES TERMINATED BY '%s'", cfg.LinesStartingBy, terminator))
   206  		}
   207  	}
   208  
   209  	escFlavor := escapeFlavorNone
   210  	var r *regexp.Regexp
   211  
   212  	if len(cfg.FieldsEscapedBy) > 0 {
   213  		escFlavor = escapeFlavorMySQL
   214  		quoteStopSet = append(quoteStopSet, cfg.FieldsEscapedBy[0])
   215  		unquoteStopSet = append(unquoteStopSet, cfg.FieldsEscapedBy[0])
   216  		// we need special treatment of the NULL value \N, used by MySQL.
   217  		if !cfg.NotNull && slices.Contains(cfg.Null, cfg.FieldsEscapedBy+`N`) {
   218  			escFlavor = escapeFlavorMySQLWithNull
   219  		}
   220  		r, err = regexp.Compile(`(?s)` + regexp.QuoteMeta(cfg.FieldsEscapedBy) + `.`)
   221  		if err != nil {
   222  			return nil, err
   223  		}
   224  	}
   225  	return &CSVParser{
   226  		reader:            reader,
   227  		blockBuf:          make([]byte, blockBufSize*BufferSizeScale),
   228  		remainBuf:         &bytes.Buffer{},
   229  		appendBuf:         &bytes.Buffer{},
   230  		cfg:               cfg,
   231  		comma:             []byte(separator),
   232  		quote:             []byte(delimiter),
   233  		newLine:           []byte(terminator),
   234  		startingBy:        []byte(cfg.LinesStartingBy),
   235  		escapedBy:         cfg.FieldsEscapedBy,
   236  		unescapeRegexp:    r,
   237  		escFlavor:         escFlavor,
   238  		quoteByteSet:      makeByteSet(quoteStopSet),
   239  		unquoteByteSet:    makeByteSet(unquoteStopSet),
   240  		newLineByteSet:    makeByteSet(newLineStopSet),
   241  		shouldParseHeader: shouldParseHeader,
   242  		allowEmptyLine:    cfg.AllowEmptyLine,
   243  		quotedNullIsText:  cfg.QuotedNullIsText,
   244  		unescapedQuote:    cfg.UnescapedQuote,
   245  		reuseRow:          reuseRow,
   246  	}, nil
   247  }
   248  func (parser *CSVParser) Read() (row []Field, err error) {
   249  	if parser.reuseRow {
   250  		row, err = parser.readRow(parser.lastRow)
   251  		parser.lastRow = row
   252  	} else {
   253  		row, err = parser.readRow(nil)
   254  	}
   255  	return row, err
   256  }
   257  
   258  func (parser *CSVParser) Pos() int64 {
   259  	return parser.pos
   260  }
   261  
   262  func validDelim(r rune) bool {
   263  	return r != 0 && r != '"' && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError
   264  }
   265  
   266  // readRow reads a row from the datafile.
   267  func (parser *CSVParser) readRow(row []Field) ([]Field, error) {
   268  	// skip the header first
   269  	if parser.shouldParseHeader {
   270  		err := parser.readColumns()
   271  		if err != nil {
   272  			return nil, err
   273  		}
   274  		parser.shouldParseHeader = false
   275  	}
   276  
   277  	records, err := parser.readRecord(parser.lastRecord)
   278  	if err != nil {
   279  		return nil, err
   280  	}
   281  	parser.lastRecord = records
   282  	// remove the last empty value
   283  	if parser.cfg.TrimLastSep {
   284  		i := len(records) - 1
   285  		if i >= 0 && len(records[i].content) == 0 {
   286  			records = records[:i]
   287  		}
   288  	}
   289  	row = row[:0]
   290  	if cap(row) < len(records) {
   291  		row = make([]Field, len(records))
   292  	}
   293  	row = row[:len(records)]
   294  	for i, record := range records {
   295  		unescaped, isNull, err := parser.unescapeString(record)
   296  		if err != nil {
   297  			return nil, err
   298  		}
   299  		row[i].IsNull = isNull
   300  		row[i].Val = unescaped
   301  	}
   302  
   303  	return row, nil
   304  }
   305  
   306  func (parser *CSVParser) unescapeString(input field) (unescaped string, isNull bool, err error) {
   307  	// Convert the input from another charset to utf8mb4 before we return the string.
   308  	unescaped = input.content
   309  	if parser.escFlavor == escapeFlavorMySQLWithNull && unescaped == parser.escapedBy+`N` {
   310  		return input.content, true, nil
   311  	}
   312  	if parser.cfg.FieldsEnclosedBy != "" && !input.quoted && unescaped == "NULL" {
   313  		return input.content, true, nil
   314  	}
   315  	if len(parser.escapedBy) > 0 {
   316  		unescaped = unescape(unescaped, "", parser.escFlavor, parser.escapedBy[0], parser.unescapeRegexp)
   317  	}
   318  	if !(len(parser.quote) > 0 && parser.quotedNullIsText && input.quoted) {
   319  		// this branch represents "quote is not configured" or "quoted null is null" or "this field has no quote"
   320  		// we check null for them
   321  		isNull = !parser.cfg.NotNull &&
   322  			slices.Contains(parser.cfg.Null, unescaped)
   323  		// avoid \\N becomes NULL
   324  		if parser.escFlavor == escapeFlavorMySQLWithNull && unescaped == parser.escapedBy+`N` {
   325  			isNull = false
   326  		}
   327  	}
   328  	return
   329  }
   330  
   331  // csvToken is a type representing either a normal byte or some CSV-specific
   332  // tokens such as the separator (comma), delimiter (quote) and terminator (new
   333  // line).
   334  type csvToken int16
   335  
   336  const (
   337  	// csvTokenAnyUnquoted is a placeholder to represent any unquoted character.
   338  	csvTokenAnyUnquoted csvToken = 0
   339  	// csvTokenEscaped is a mask indicating an escaped character.
   340  	// The actual token is represented like `csvTokenEscaped | 'n'`.
   341  	csvTokenEscaped csvToken = 0x100
   342  	// csvTokenComma is the CSV separator token.
   343  	csvTokenComma csvToken = 0x200
   344  	// csvTokenNewLine is the CSV terminator token.
   345  	csvTokenNewLine csvToken = 0x400
   346  	// csvTokenDelimiter is the CSV delimiter token.
   347  	csvTokenDelimiter csvToken = 0x800
   348  )
   349  
   350  func (parser *CSVParser) readByte() (byte, error) {
   351  	if len(parser.buf) == 0 {
   352  		if err := parser.readBlock(); err != nil {
   353  			return 0, err
   354  		}
   355  	}
   356  	if len(parser.buf) == 0 {
   357  		return 0, io.EOF
   358  	}
   359  	b := parser.buf[0]
   360  	parser.buf = parser.buf[1:]
   361  	parser.pos++
   362  	return b, nil
   363  }
   364  
   365  func (parser *CSVParser) peekBytes(cnt int) ([]byte, error) {
   366  	if len(parser.buf) < cnt {
   367  		if err := parser.readBlock(); err != nil {
   368  			return nil, err
   369  		}
   370  	}
   371  	if len(parser.buf) == 0 {
   372  		return nil, io.EOF
   373  	}
   374  	if len(parser.buf) < cnt {
   375  		cnt = len(parser.buf)
   376  	}
   377  	return parser.buf[:cnt], nil
   378  }
   379  
   380  func (parser *CSVParser) skipBytes(n int) {
   381  	parser.buf = parser.buf[n:]
   382  	parser.pos += int64(n)
   383  }
   384  
   385  // tryPeekExact peeks the bytes ahead, and if it matches `content` exactly will
   386  // return (true, false, nil). If meet EOF it will return (false, true, nil).
   387  // For other errors it will return (false, false, err).
   388  func (parser *CSVParser) tryPeekExact(content []byte) (matched bool, eof bool, err error) {
   389  	if len(content) == 0 {
   390  		return true, false, nil
   391  	}
   392  	bs, err := parser.peekBytes(len(content))
   393  	if err == nil {
   394  		if bytes.Equal(bs, content) {
   395  			return true, false, nil
   396  		}
   397  	} else if err == io.EOF {
   398  		return false, true, nil
   399  	}
   400  	return false, false, err
   401  }
   402  
   403  // tryReadExact peeks the bytes ahead, and if it matches `content` exactly will
   404  // consume it (advance the cursor) and return `true`.
   405  func (parser *CSVParser) tryReadExact(content []byte) (bool, error) {
   406  	matched, _, err := parser.tryPeekExact(content)
   407  	if matched {
   408  		parser.skipBytes(len(content))
   409  	}
   410  	return matched, err
   411  }
   412  
   413  func (parser *CSVParser) tryReadNewLine(b byte) (bool, error) {
   414  	if len(parser.newLine) == 0 {
   415  		return b == '\r' || b == '\n', nil
   416  	}
   417  	if b != parser.newLine[0] {
   418  		return false, nil
   419  	}
   420  	return parser.tryReadExact(parser.newLine[1:])
   421  }
   422  
   423  func (parser *CSVParser) tryReadOpenDelimiter(b byte) (bool, error) {
   424  	if len(parser.quote) == 0 || parser.quote[0] != b {
   425  		return false, nil
   426  	}
   427  	return parser.tryReadExact(parser.quote[1:])
   428  }
   429  
   430  // tryReadCloseDelimiter is currently equivalent to tryReadOpenDelimiter until
   431  // we support asymmetric delimiters.
   432  func (parser *CSVParser) tryReadCloseDelimiter(b byte) (bool, error) {
   433  	if parser.quote[0] != b {
   434  		return false, nil
   435  	}
   436  	return parser.tryReadExact(parser.quote[1:])
   437  }
   438  
   439  func (parser *CSVParser) tryReadComma(b byte) (bool, error) {
   440  	if parser.comma[0] != b {
   441  		return false, nil
   442  	}
   443  	return parser.tryReadExact(parser.comma[1:])
   444  }
   445  
   446  func (parser *CSVParser) tryReadEscaped(bs byte) (bool, byte, error) {
   447  	if parser.escapedBy == "" {
   448  		return false, 0, nil
   449  	}
   450  	if bs != parser.escapedBy[0] || parser.escFlavor == escapeFlavorNone {
   451  		return false, 0, nil
   452  	}
   453  	b, err := parser.readByte()
   454  	return true, b, parser.replaceEOF(err, errDanglingBackslash)
   455  }
   456  
   457  // readQuoteToken reads a token inside quoted fields.
   458  func (parser *CSVParser) readQuotedToken(b byte) (csvToken, error) {
   459  	if ok, err := parser.tryReadCloseDelimiter(b); ok || err != nil {
   460  		return csvTokenDelimiter, err
   461  	}
   462  	if ok, eb, err := parser.tryReadEscaped(b); ok || err != nil {
   463  		return csvTokenEscaped | csvToken(eb), err
   464  	}
   465  	return csvToken(b), nil
   466  }
   467  
   468  // readUnquoteToken reads a token outside quoted fields.
   469  func (parser *CSVParser) readUnquoteToken(b byte) (csvToken, error) {
   470  	if ok, err := parser.tryReadNewLine(b); ok || err != nil {
   471  		return csvTokenNewLine, err
   472  	}
   473  	if ok, err := parser.tryReadComma(b); ok || err != nil {
   474  		return csvTokenComma, err
   475  	}
   476  	if ok, err := parser.tryReadOpenDelimiter(b); ok || err != nil {
   477  		return csvTokenDelimiter, err
   478  	}
   479  	if ok, eb, err := parser.tryReadEscaped(b); ok || err != nil {
   480  		return csvTokenEscaped | csvToken(eb), err
   481  	}
   482  	return csvToken(b), nil
   483  }
   484  
   485  func (parser *CSVParser) appendCSVTokenToRecordBuffer(token csvToken) {
   486  	if token&csvTokenEscaped != 0 {
   487  		parser.recordBuffer = append(parser.recordBuffer, parser.escapedBy[0])
   488  	}
   489  	parser.recordBuffer = append(parser.recordBuffer, byte(token))
   490  }
   491  
   492  // readUntil reads the buffer until any character from the `chars` set is found.
   493  // that character is excluded from the final buffer.
   494  func (parser *CSVParser) readUntil(chars *byteSet) ([]byte, byte, error) {
   495  	index := IndexAnyByte(parser.buf, chars)
   496  	if index >= 0 {
   497  		ret := parser.buf[:index]
   498  		parser.buf = parser.buf[index:]
   499  		parser.pos += int64(index)
   500  		return ret, parser.buf[0], nil
   501  	}
   502  
   503  	// not found in parser.buf, need allocate and loop.
   504  	var buf []byte
   505  	for {
   506  		buf = append(buf, parser.buf...)
   507  		if len(buf) > LargestEntryLimit {
   508  			return buf, 0, moerr.NewInternalErrorNoCtx("size of row cannot exceed the max value of txn-entry-size-limit")
   509  		}
   510  		parser.buf = nil
   511  		if err := parser.readBlock(); err != nil || len(parser.buf) == 0 {
   512  			if err == nil {
   513  				err = io.EOF
   514  			}
   515  			parser.pos += int64(len(buf))
   516  			return buf, 0, err
   517  		}
   518  		index := IndexAnyByte(parser.buf, chars)
   519  		if index >= 0 {
   520  			buf = append(buf, parser.buf[:index]...)
   521  			parser.buf = parser.buf[index:]
   522  			parser.pos += int64(len(buf))
   523  			return buf, parser.buf[0], nil
   524  		}
   525  	}
   526  }
   527  
   528  func (parser *CSVParser) readRecord(dst []field) ([]field, error) {
   529  	parser.recordBuffer = parser.recordBuffer[:0]
   530  	parser.fieldIndexes = parser.fieldIndexes[:0]
   531  	parser.fieldIsQuoted = parser.fieldIsQuoted[:0]
   532  
   533  	isEmptyLine := true
   534  	whitespaceLine := true
   535  	foundStartingByThisLine := false
   536  	prevToken := csvTokenNewLine
   537  	fieldIsQuoted := false
   538  	var firstToken csvToken
   539  
   540  outside:
   541  	for {
   542  		// we should drop
   543  		// 1. the whole line if it does not contain startingBy
   544  		// 2. any character before startingBy
   545  		// since we have checked startingBy does not contain terminator, we can
   546  		// split at terminator to check the substring contains startingBy. Even
   547  		// if the terminator is inside a quoted field which means it's not the
   548  		// end of a line, the substring can still be dropped by rule 2.
   549  		if len(parser.startingBy) > 0 && !foundStartingByThisLine {
   550  			oldPos := parser.pos
   551  			content, _, err := parser.readUntilTerminator()
   552  			if err != nil {
   553  				if len(content) == 0 {
   554  					return nil, err
   555  				}
   556  				// if we reached EOF, we should still check the content contains
   557  				// startingBy and try to put back and parse it.
   558  			}
   559  			idx := bytes.Index(content, parser.startingBy)
   560  			if idx == -1 {
   561  				continue
   562  			}
   563  			foundStartingByThisLine = true
   564  			content = content[idx+len(parser.startingBy):]
   565  			parser.buf = append(content, parser.buf...)
   566  			parser.pos = oldPos + int64(idx+len(parser.startingBy))
   567  		}
   568  
   569  		content, firstByte, err := parser.readUntil(&parser.unquoteByteSet)
   570  
   571  		if len(content) > 0 {
   572  			isEmptyLine = false
   573  			if prevToken == csvTokenDelimiter {
   574  				return nil, errUnexpectedQuoteField
   575  			}
   576  			parser.recordBuffer = append(parser.recordBuffer, content...)
   577  			prevToken = csvTokenAnyUnquoted
   578  		}
   579  
   580  		if err != nil {
   581  			if isEmptyLine || err != io.EOF {
   582  				return nil, err
   583  			}
   584  			// treat EOF as the same as trailing \n.
   585  			firstToken = csvTokenNewLine
   586  		} else {
   587  			parser.skipBytes(1)
   588  			firstToken, err = parser.readUnquoteToken(firstByte)
   589  			if err != nil {
   590  				return nil, err
   591  			}
   592  		}
   593  
   594  		switch firstToken {
   595  		case csvTokenComma:
   596  			whitespaceLine = false
   597  			parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer))
   598  			parser.fieldIsQuoted = append(parser.fieldIsQuoted, fieldIsQuoted)
   599  			fieldIsQuoted = false
   600  		case csvTokenDelimiter:
   601  			if prevToken != csvTokenComma && prevToken != csvTokenNewLine {
   602  				if parser.unescapedQuote {
   603  					whitespaceLine = false
   604  					parser.recordBuffer = append(parser.recordBuffer, parser.quote...)
   605  					continue
   606  				}
   607  				return nil, errUnexpectedQuoteField
   608  			}
   609  			if err = parser.readQuotedField(); err != nil {
   610  				return nil, err
   611  			}
   612  			fieldIsQuoted = true
   613  			whitespaceLine = false
   614  		case csvTokenNewLine:
   615  			foundStartingByThisLine = false
   616  			// new line = end of field (ignore empty lines)
   617  			prevToken = firstToken
   618  			if !parser.allowEmptyLine {
   619  				if isEmptyLine {
   620  					continue
   621  				}
   622  				// skip lines only contain whitespaces
   623  				if err == nil && whitespaceLine && len(bytes.TrimSpace(parser.recordBuffer)) == 0 {
   624  					parser.recordBuffer = parser.recordBuffer[:0]
   625  					continue
   626  				}
   627  			}
   628  			// skip lines start with comment
   629  			if err == nil && parser.comment != 0 && parser.recordBuffer[0] == parser.comment {
   630  				parser.recordBuffer = parser.recordBuffer[:0]
   631  				parser.fieldIndexes = parser.fieldIndexes[:0]
   632  				parser.fieldIsQuoted = parser.fieldIsQuoted[:0]
   633  
   634  				isEmptyLine = true
   635  				whitespaceLine = true
   636  				foundStartingByThisLine = false
   637  				prevToken = csvTokenNewLine
   638  				fieldIsQuoted = false
   639  				continue
   640  			}
   641  			if bytes.Equal(parser.newLine, []byte{'\n'}) {
   642  				if n := len(parser.recordBuffer); n > 1 && parser.recordBuffer[n-1] == '\r' {
   643  					parser.recordBuffer = parser.recordBuffer[:n-1]
   644  				}
   645  			}
   646  			parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer))
   647  			parser.fieldIsQuoted = append(parser.fieldIsQuoted, fieldIsQuoted)
   648  			// the loop is end, no need to reset fieldIsQuoted
   649  			break outside
   650  		default:
   651  			if prevToken == csvTokenDelimiter {
   652  				return nil, errUnexpectedQuoteField
   653  			}
   654  			parser.appendCSVTokenToRecordBuffer(firstToken)
   655  		}
   656  		prevToken = firstToken
   657  		isEmptyLine = false
   658  	}
   659  	// Create a single string and create slices out of it.
   660  	// This pins the memory of the fields together, but allocates once.
   661  	str := string(parser.recordBuffer) // Convert to string once to batch allocations
   662  	dst = dst[:0]
   663  	if cap(dst) < len(parser.fieldIndexes) {
   664  		dst = make([]field, len(parser.fieldIndexes))
   665  	}
   666  	dst = dst[:len(parser.fieldIndexes)]
   667  	var preIdx int
   668  	for i, idx := range parser.fieldIndexes {
   669  		dst[i].content = str[preIdx:idx]
   670  		dst[i].quoted = parser.fieldIsQuoted[i]
   671  		preIdx = idx
   672  	}
   673  
   674  	// Check or update the expected fields per field.
   675  	return dst, nil
   676  }
   677  
   678  func (parser *CSVParser) readQuotedField() error {
   679  	for {
   680  		prevPos := parser.pos
   681  		content, terminator, err := parser.readUntil(&parser.quoteByteSet)
   682  		if err != nil {
   683  			if err == io.EOF {
   684  				// return the position of quote to the caller.
   685  				// because we return an error here, the parser won't
   686  				// use the `pos` again, so it's safe to modify it here.
   687  				parser.pos = prevPos - 1
   688  				// set buf to parser.buf in order to print err log
   689  				parser.buf = content
   690  				err = parser.replaceEOF(err, errUnterminatedQuotedField)
   691  			}
   692  			return err
   693  		}
   694  		parser.recordBuffer = append(parser.recordBuffer, content...)
   695  		parser.skipBytes(1)
   696  
   697  		token, err := parser.readQuotedToken(terminator)
   698  		if err != nil {
   699  			return err
   700  		}
   701  
   702  		switch token {
   703  		case csvTokenDelimiter:
   704  			// encountered '"' -> continue if we're seeing '""'.
   705  			doubledDelimiter, err := parser.tryReadExact(parser.quote)
   706  			if err != nil {
   707  				return err
   708  			}
   709  			if doubledDelimiter {
   710  				// consume the double quotation mark and continue
   711  				parser.recordBuffer = append(parser.recordBuffer, parser.quote...)
   712  			} else if parser.unescapedQuote {
   713  				// allow unescaped quote inside quoted field, so we only finish
   714  				// reading the field when we see a delimiter + comma/newline.
   715  				comma, _, err2 := parser.tryPeekExact(parser.comma)
   716  				if comma || err2 != nil {
   717  					return err2
   718  				}
   719  				newline, eof, err2 := parser.tryPeekExact(parser.newLine)
   720  				if eof || newline {
   721  					return nil
   722  				}
   723  				if err2 != nil {
   724  					return err2
   725  				}
   726  				parser.recordBuffer = append(parser.recordBuffer, parser.quote...)
   727  			} else {
   728  				// the field is completed, exit.
   729  				return nil
   730  			}
   731  		default:
   732  			parser.appendCSVTokenToRecordBuffer(token)
   733  		}
   734  	}
   735  }
   736  
   737  func (parser *CSVParser) replaceEOF(err error, replaced error) error {
   738  	if err == nil || err != io.EOF {
   739  		return err
   740  	}
   741  	return replaced
   742  }
   743  
   744  // readColumns reads the columns of this CSV file.
   745  func (parser *CSVParser) readColumns() error {
   746  	columns, err := parser.readRecord(nil)
   747  	if err != nil {
   748  		return err
   749  	}
   750  	if !parser.cfg.HeaderSchemaMatch {
   751  		return nil
   752  	}
   753  	parser.columns = make([]string, 0, len(columns))
   754  	for _, colName := range columns {
   755  		colNameStr, _, err := parser.unescapeString(colName)
   756  		if err != nil {
   757  			return err
   758  		}
   759  		parser.columns = append(parser.columns, strings.ToLower(colNameStr))
   760  	}
   761  	return nil
   762  }
   763  
   764  // readUntilTerminator seeks the file until the terminator token is found, and
   765  // returns
   766  // - the content with terminator, or the content read before meet error
   767  // - the file offset beyond the terminator, or the offset when meet error
   768  // - error
   769  // Note that the terminator string pattern may be the content of a field, which
   770  // means it's inside quotes. Caller should make sure to handle this case.
   771  func (parser *CSVParser) readUntilTerminator() ([]byte, int64, error) {
   772  	var ret []byte
   773  	for {
   774  		content, firstByte, err := parser.readUntil(&parser.newLineByteSet)
   775  		ret = append(ret, content...)
   776  		if err != nil {
   777  			return ret, parser.pos, err
   778  		}
   779  		parser.skipBytes(1)
   780  		ret = append(ret, firstByte)
   781  		if ok, err := parser.tryReadNewLine(firstByte); ok || err != nil {
   782  			if len(parser.newLine) >= 1 {
   783  				ret = append(ret, parser.newLine[1:]...)
   784  			}
   785  			return ret, parser.pos, err
   786  		}
   787  	}
   788  }
   789  
   790  func (parser *CSVParser) readBlock() error {
   791  	n, err := io.ReadFull(parser.reader, parser.blockBuf)
   792  
   793  	switch {
   794  	case errors.Is(err, io.ErrUnexpectedEOF), err == io.EOF:
   795  		parser.isLastChunk = true
   796  		fallthrough
   797  	case err == nil:
   798  		// `parser.buf` reference to `appendBuf.Bytes`, so should use remainBuf to
   799  		// hold the `parser.buf` rest data to prevent slice overlap
   800  		parser.remainBuf.Reset()
   801  		parser.remainBuf.Write(parser.buf)
   802  		parser.appendBuf.Reset()
   803  		parser.appendBuf.Write(parser.remainBuf.Bytes())
   804  		blockData := parser.blockBuf[:n]
   805  		if parser.pos == 0 {
   806  			bomCleanedData := bom.Clean(blockData)
   807  			parser.pos += int64(n - len(bomCleanedData))
   808  			blockData = bomCleanedData
   809  		}
   810  		parser.appendBuf.Write(blockData)
   811  		parser.buf = parser.appendBuf.Bytes()
   812  		return nil
   813  	default:
   814  		return err
   815  	}
   816  }
   817  
   818  func (parser *CSVParser) Columns() []string {
   819  	return parser.columns
   820  }
   821  
   822  func (parser *CSVParser) SetColumns(columns []string) {
   823  	parser.columns = columns
   824  }
   825  
   826  func unescape(
   827  	input string,
   828  	delim string,
   829  	escFlavor escapeFlavor,
   830  	escChar byte,
   831  	unescapeRegexp *regexp.Regexp,
   832  ) string {
   833  	if len(delim) > 0 {
   834  		delim2 := delim + delim
   835  		if strings.Contains(input, delim2) {
   836  			input = strings.ReplaceAll(input, delim2, delim)
   837  		}
   838  	}
   839  	if escFlavor != escapeFlavorNone && strings.IndexByte(input, escChar) != -1 {
   840  		input = unescapeRegexp.ReplaceAllStringFunc(input, func(substr string) string {
   841  			switch substr[1] {
   842  			case '0':
   843  				return "\x00"
   844  			case 'b':
   845  				return "\b"
   846  			case 'n':
   847  				return "\n"
   848  			case 'r':
   849  				return "\r"
   850  			case 't':
   851  				return "\t"
   852  			case 'Z':
   853  				return "\x1a"
   854  			default:
   855  				return substr[1:]
   856  			}
   857  		})
   858  	}
   859  	return input
   860  }
   861  
   862  // Copyright 2009 The Go Authors. All rights reserved.
   863  // Use of this source code is governed by a BSD-style
   864  // license that can be found in the LICENSE file.
   865  
   866  // Package bytes implements functions for the manipulation of byte slices.
   867  // It is analogous to the facilities of the strings package.
   868  
   869  // this part is copy from `bytes/bytes.go`
   870  
   871  // byteSet is a 32-byte value, where each bit represents the presence of a
   872  // given byte value in the set.
   873  type byteSet [8]uint32
   874  
   875  // makeByteSet creates a set of byte value.
   876  func makeByteSet(chars []byte) (as byteSet) {
   877  	for i := 0; i < len(chars); i++ {
   878  		c := chars[i]
   879  		as[c>>5] |= 1 << uint(c&31)
   880  	}
   881  	return as
   882  }
   883  
   884  // contains reports whether c is inside the set.
   885  func (as *byteSet) contains(c byte) bool {
   886  	return (as[c>>5] & (1 << uint(c&31))) != 0
   887  }
   888  
   889  // IndexAnyByte returns the byte index of the first occurrence in s of any in the byte
   890  // points in chars. It returns -1 if  there is no code point in common.
   891  func IndexAnyByte(s []byte, as *byteSet) int {
   892  	for i, c := range s {
   893  		if as.contains(c) {
   894  			return i
   895  		}
   896  	}
   897  	return -1
   898  }