// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package csvparser

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"regexp"
	"slices"
	"strings"
	"unicode/utf8"

	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/spkg/bom"
)

var (
	errUnterminatedQuotedField = moerr.NewInvalidInputNoCtx("csvParser error: unterminated quoted field")
	errDanglingBackslash       = moerr.NewInvalidInputNoCtx("csvParser error: no character after backslash")
	errUnexpectedQuoteField    = moerr.NewInvalidInputNoCtx("csvParser error: cannot have consecutive fields without separator")
	// BufferSizeScale multiplies the caller-supplied block size when the
	// internal read buffer (blockBuf) is allocated in NewCSVParser.
	BufferSizeScale       = int64(5)
	ReadBlockSize   int64 = 64 * 1024
	// LargestEntryLimit is the max size for reading file to buf
	LargestEntryLimit = 10 * 1024 * 1024
)

// Field is one parsed CSV value: the unescaped text and whether it
// represents SQL NULL.
type Field struct {
	Val    string
	IsNull bool
}

// escapeFlavor selects how the escape character and the `<esc>N` null
// marker are interpreted.
type escapeFlavor uint8

const (
	// escapeFlavorNone: no escape character is configured.
	escapeFlavorNone escapeFlavor = iota
	// escapeFlavorMySQL: FieldsEscapedBy is set and escapes are expanded.
	escapeFlavorMySQL
	// escapeFlavorMySQLWithNull: additionally `<esc>N` is recognized as NULL
	// (enabled when that spelling appears in CSVConfig.Null).
	escapeFlavorMySQLWithNull
)

// CSVConfig carries the MySQL LOAD DATA style options that drive parsing.
type CSVConfig struct {
	// they can only be used by LOAD DATA
	// https://dev.mysql.com/doc/refman/8.0/en/load-data.html#load-data-field-line-handling
	LinesStartingBy   string
	LinesTerminatedBy string

	FieldsTerminatedBy string
	FieldsEnclosedBy   string
	FieldsEscapedBy    string

	// Null lists the literal spellings treated as SQL NULL (ignored when
	// NotNull is set).
	Null   []string
	Header bool
	// HeaderSchemaMatch: when true, readColumns keeps the parsed header
	// (lower-cased) as the parser's column list.
	HeaderSchemaMatch bool
	TrimLastSep       bool
	NotNull           bool

	AllowEmptyLine bool
	// For non-empty FieldsEnclosedBy (for example quotes), null elements inside quotes are not considered as null except for
	// `\N` (when escape-by is `\`). That is to say, `\N` is special for null because it always means null.
	QuotedNullIsText bool
	// ref https://dev.mysql.com/doc/refman/8.0/en/load-data.html
	// > If the field begins with the ENCLOSED BY character, instances of that character are recognized as terminating a
	// > field value only if followed by the field or line TERMINATED BY sequence.
	// This means we will meet unescaped quote in a quoted field
	// > The "BIG" boss -> The "BIG" boss
	// This means we will meet unescaped quote in an unquoted field
	UnescapedQuote bool

	// see csv.Reader
	Comment byte
}

// CSVParser is basically a copy of encoding/csv, but special-cased for MySQL-like input.
type CSVParser struct {
	cfg *CSVConfig

	// byte-slice forms of the configured separator, delimiter, terminator
	// and STARTING BY prefix.
	comma          []byte
	quote          []byte
	newLine        []byte
	startingBy     []byte
	escapedBy      string
	unescapeRegexp *regexp.Regexp

	// These variables are used with IndexAnyByte to search a byte slice for the
	// first index which some special character may appear.
	// quoteByteSet is used inside quoted fields (so the first characters of
	// the closing delimiter and backslash are special).
	// unquoteByteSet is used outside quoted fields (so the first characters
	// of the opening delimiter, separator, terminator and backslash are
	// special).
	// newLineByteSet is used in strict-format CSV dividing (so the first
	// characters of the terminator are special).
	quoteByteSet   byteSet
	unquoteByteSet byteSet
	newLineByteSet byteSet

	// recordBuffer holds the unescaped fields, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
	// and fieldIndexes will contain the indexes [1, 2, 5, 6].
	recordBuffer []byte

	// fieldIndexes is an index of fields inside recordBuffer.
	// The width field ends at offset fieldIndexes[i] in recordBuffer.
	fieldIndexes  []int
	fieldIsQuoted []bool

	// lastRecord is recycled across readRecord calls to avoid reallocation.
	lastRecord []field

	escFlavor escapeFlavor
	// if set to true, csv parser will treat the first non-empty line as header line
	shouldParseHeader bool
	// in LOAD DATA, empty line should be treated as a valid field
	allowEmptyLine   bool
	quotedNullIsText bool
	unescapedQuote   bool

	reader io.Reader
	// stores data that has NOT been parsed yet, it shares same memory as appendBuf.
	buf []byte
	// used to read data from the reader, the data will be moved to other buffers.
	blockBuf    []byte
	isLastChunk bool

	// The list of column names of the last INSERT statement.
	columns []string

	// lastRow is recycled across Read calls when reuseRow is set.
	lastRow []Field

	// the reader position we have parsed, if the underlying reader is not
	// a compressed file, it's the file position we have parsed too.
	// this value may go backward when failed to read quoted field, but it's
	// for printing error message, and the parser should not be used later,
	// so it's ok, see readQuotedField.
	pos int64

	// cache
	remainBuf *bytes.Buffer
	appendBuf *bytes.Buffer

	// reuseRow: when true, Read hands the previously returned row slice back
	// to readRow for reuse.
	reuseRow bool

	// see csv.Reader
	comment byte
}

// field is an intermediate parsed value: the raw content and whether it was
// enclosed in the configured delimiter.
type field struct {
	content string
	quoted  bool
}

// NewCSVParser creates a CSV parser.
166 func NewCSVParser( 167 cfg *CSVConfig, 168 reader io.Reader, 169 blockBufSize int64, 170 shouldParseHeader bool, 171 reuseRow bool, 172 ) (*CSVParser, error) { 173 // see csv.Reader 174 if !validDelim(rune(cfg.FieldsTerminatedBy[0])) || (cfg.Comment != 0 && !validDelim(rune(cfg.Comment))) || cfg.Comment == cfg.FieldsTerminatedBy[0] { 175 return nil, moerr.NewInvalidInputNoCtx("invalid field or comment delimiter") 176 } 177 178 var err error 179 var separator, delimiter, terminator string 180 181 separator = cfg.FieldsTerminatedBy 182 delimiter = cfg.FieldsEnclosedBy 183 terminator = cfg.LinesTerminatedBy 184 185 if terminator == "\r\n" { 186 terminator = "\n" 187 } 188 189 var quoteStopSet, newLineStopSet []byte 190 unquoteStopSet := []byte{separator[0]} 191 if len(delimiter) > 0 { 192 quoteStopSet = []byte{delimiter[0]} 193 unquoteStopSet = append(unquoteStopSet, delimiter[0]) 194 } 195 if len(terminator) > 0 { 196 newLineStopSet = []byte{terminator[0]} 197 } else { 198 // The character set encoding of '\r' and '\n' is the same in UTF-8 and GBK. 199 newLineStopSet = []byte{'\r', '\n'} 200 } 201 unquoteStopSet = append(unquoteStopSet, newLineStopSet...) 202 203 if len(cfg.LinesStartingBy) > 0 { 204 if strings.Contains(cfg.LinesStartingBy, terminator) { 205 return nil, moerr.NewInvalidInputNoCtx(fmt.Sprintf("STARTING BY '%s' cannot contain LINES TERMINATED BY '%s'", cfg.LinesStartingBy, terminator)) 206 } 207 } 208 209 escFlavor := escapeFlavorNone 210 var r *regexp.Regexp 211 212 if len(cfg.FieldsEscapedBy) > 0 { 213 escFlavor = escapeFlavorMySQL 214 quoteStopSet = append(quoteStopSet, cfg.FieldsEscapedBy[0]) 215 unquoteStopSet = append(unquoteStopSet, cfg.FieldsEscapedBy[0]) 216 // we need special treatment of the NULL value \N, used by MySQL. 
217 if !cfg.NotNull && slices.Contains(cfg.Null, cfg.FieldsEscapedBy+`N`) { 218 escFlavor = escapeFlavorMySQLWithNull 219 } 220 r, err = regexp.Compile(`(?s)` + regexp.QuoteMeta(cfg.FieldsEscapedBy) + `.`) 221 if err != nil { 222 return nil, err 223 } 224 } 225 return &CSVParser{ 226 reader: reader, 227 blockBuf: make([]byte, blockBufSize*BufferSizeScale), 228 remainBuf: &bytes.Buffer{}, 229 appendBuf: &bytes.Buffer{}, 230 cfg: cfg, 231 comma: []byte(separator), 232 quote: []byte(delimiter), 233 newLine: []byte(terminator), 234 startingBy: []byte(cfg.LinesStartingBy), 235 escapedBy: cfg.FieldsEscapedBy, 236 unescapeRegexp: r, 237 escFlavor: escFlavor, 238 quoteByteSet: makeByteSet(quoteStopSet), 239 unquoteByteSet: makeByteSet(unquoteStopSet), 240 newLineByteSet: makeByteSet(newLineStopSet), 241 shouldParseHeader: shouldParseHeader, 242 allowEmptyLine: cfg.AllowEmptyLine, 243 quotedNullIsText: cfg.QuotedNullIsText, 244 unescapedQuote: cfg.UnescapedQuote, 245 reuseRow: reuseRow, 246 }, nil 247 } 248 func (parser *CSVParser) Read() (row []Field, err error) { 249 if parser.reuseRow { 250 row, err = parser.readRow(parser.lastRow) 251 parser.lastRow = row 252 } else { 253 row, err = parser.readRow(nil) 254 } 255 return row, err 256 } 257 258 func (parser *CSVParser) Pos() int64 { 259 return parser.pos 260 } 261 262 func validDelim(r rune) bool { 263 return r != 0 && r != '"' && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError 264 } 265 266 // readRow reads a row from the datafile. 
// readRow reads one row, optionally reusing the caller-provided slice `row`
// as backing storage. The first call consumes the header when configured.
func (parser *CSVParser) readRow(row []Field) ([]Field, error) {
	// skip the header first
	if parser.shouldParseHeader {
		err := parser.readColumns()
		if err != nil {
			return nil, err
		}
		parser.shouldParseHeader = false
	}

	records, err := parser.readRecord(parser.lastRecord)
	if err != nil {
		return nil, err
	}
	parser.lastRecord = records
	// remove the last empty value
	if parser.cfg.TrimLastSep {
		i := len(records) - 1
		if i >= 0 && len(records[i].content) == 0 {
			records = records[:i]
		}
	}
	// Reuse `row`'s capacity when possible; otherwise allocate a fresh slice.
	row = row[:0]
	if cap(row) < len(records) {
		row = make([]Field, len(records))
	}
	row = row[:len(records)]
	for i, record := range records {
		unescaped, isNull, err := parser.unescapeString(record)
		if err != nil {
			return nil, err
		}
		row[i].IsNull = isNull
		row[i].Val = unescaped
	}

	return row, nil
}

// unescapeString resolves escapes in a raw field and decides NULL-ness
// according to the escape flavor, quoting, and the configured NULL spellings.
func (parser *CSVParser) unescapeString(input field) (unescaped string, isNull bool, err error) {
	// Convert the input from another charset to utf8mb4 before we return the string.
	unescaped = input.content
	// `<esc>N` always means NULL when that flavor is active, quoted or not.
	if parser.escFlavor == escapeFlavorMySQLWithNull && unescaped == parser.escapedBy+`N` {
		return input.content, true, nil
	}
	// With an enclosing delimiter configured, an UNQUOTED literal `NULL` is null.
	if parser.cfg.FieldsEnclosedBy != "" && !input.quoted && unescaped == "NULL" {
		return input.content, true, nil
	}
	if len(parser.escapedBy) > 0 {
		unescaped = unescape(unescaped, "", parser.escFlavor, parser.escapedBy[0], parser.unescapeRegexp)
	}
	if !(len(parser.quote) > 0 && parser.quotedNullIsText && input.quoted) {
		// this branch represents "quote is not configured" or "quoted null is null" or "this field has no quote"
		// we check null for them
		isNull = !parser.cfg.NotNull &&
			slices.Contains(parser.cfg.Null, unescaped)
		// avoid \\N becomes NULL
		if parser.escFlavor == escapeFlavorMySQLWithNull && unescaped == parser.escapedBy+`N` {
			isNull = false
		}
	}
	return
}

// csvToken is a type representing either a normal byte or some CSV-specific
// tokens such as the separator (comma), delimiter (quote) and terminator (new
// line).
type csvToken int16

const (
	// csvTokenAnyUnquoted is a placeholder to represent any unquoted character.
	csvTokenAnyUnquoted csvToken = 0
	// csvTokenEscaped is a mask indicating an escaped character.
	// The actual token is represented like `csvTokenEscaped | 'n'`.
	csvTokenEscaped csvToken = 0x100
	// csvTokenComma is the CSV separator token.
	csvTokenComma csvToken = 0x200
	// csvTokenNewLine is the CSV terminator token.
	csvTokenNewLine csvToken = 0x400
	// csvTokenDelimiter is the CSV delimiter token.
	csvTokenDelimiter csvToken = 0x800
)

// readByte consumes and returns one byte, refilling from the reader when the
// internal buffer is empty. Returns io.EOF at end of input.
func (parser *CSVParser) readByte() (byte, error) {
	if len(parser.buf) == 0 {
		if err := parser.readBlock(); err != nil {
			return 0, err
		}
	}
	if len(parser.buf) == 0 {
		return 0, io.EOF
	}
	b := parser.buf[0]
	parser.buf = parser.buf[1:]
	parser.pos++
	return b, nil
}

// peekBytes returns up to cnt bytes without consuming them. It refills the
// buffer at most once, so fewer than cnt bytes may be returned near EOF.
func (parser *CSVParser) peekBytes(cnt int) ([]byte, error) {
	if len(parser.buf) < cnt {
		if err := parser.readBlock(); err != nil {
			return nil, err
		}
	}
	if len(parser.buf) == 0 {
		return nil, io.EOF
	}
	if len(parser.buf) < cnt {
		cnt = len(parser.buf)
	}
	return parser.buf[:cnt], nil
}

// skipBytes consumes n already-peeked bytes and advances the position.
func (parser *CSVParser) skipBytes(n int) {
	parser.buf = parser.buf[n:]
	parser.pos += int64(n)
}

// tryPeekExact peeks the bytes ahead, and if it matches `content` exactly will
// return (true, false, nil). If meet EOF it will return (false, true, nil).
// For other errors it will return (false, false, err).
405 func (parser *CSVParser) tryReadExact(content []byte) (bool, error) { 406 matched, _, err := parser.tryPeekExact(content) 407 if matched { 408 parser.skipBytes(len(content)) 409 } 410 return matched, err 411 } 412 413 func (parser *CSVParser) tryReadNewLine(b byte) (bool, error) { 414 if len(parser.newLine) == 0 { 415 return b == '\r' || b == '\n', nil 416 } 417 if b != parser.newLine[0] { 418 return false, nil 419 } 420 return parser.tryReadExact(parser.newLine[1:]) 421 } 422 423 func (parser *CSVParser) tryReadOpenDelimiter(b byte) (bool, error) { 424 if len(parser.quote) == 0 || parser.quote[0] != b { 425 return false, nil 426 } 427 return parser.tryReadExact(parser.quote[1:]) 428 } 429 430 // tryReadCloseDelimiter is currently equivalent to tryReadOpenDelimiter until 431 // we support asymmetric delimiters. 432 func (parser *CSVParser) tryReadCloseDelimiter(b byte) (bool, error) { 433 if parser.quote[0] != b { 434 return false, nil 435 } 436 return parser.tryReadExact(parser.quote[1:]) 437 } 438 439 func (parser *CSVParser) tryReadComma(b byte) (bool, error) { 440 if parser.comma[0] != b { 441 return false, nil 442 } 443 return parser.tryReadExact(parser.comma[1:]) 444 } 445 446 func (parser *CSVParser) tryReadEscaped(bs byte) (bool, byte, error) { 447 if parser.escapedBy == "" { 448 return false, 0, nil 449 } 450 if bs != parser.escapedBy[0] || parser.escFlavor == escapeFlavorNone { 451 return false, 0, nil 452 } 453 b, err := parser.readByte() 454 return true, b, parser.replaceEOF(err, errDanglingBackslash) 455 } 456 457 // readQuoteToken reads a token inside quoted fields. 
// readQuotedToken classifies one byte while inside a quoted field: closing
// delimiter, escaped character, or plain byte.
func (parser *CSVParser) readQuotedToken(b byte) (csvToken, error) {
	if ok, err := parser.tryReadCloseDelimiter(b); ok || err != nil {
		return csvTokenDelimiter, err
	}
	if ok, eb, err := parser.tryReadEscaped(b); ok || err != nil {
		return csvTokenEscaped | csvToken(eb), err
	}
	return csvToken(b), nil
}

// readUnquoteToken reads a token outside quoted fields.
func (parser *CSVParser) readUnquoteToken(b byte) (csvToken, error) {
	if ok, err := parser.tryReadNewLine(b); ok || err != nil {
		return csvTokenNewLine, err
	}
	if ok, err := parser.tryReadComma(b); ok || err != nil {
		return csvTokenComma, err
	}
	if ok, err := parser.tryReadOpenDelimiter(b); ok || err != nil {
		return csvTokenDelimiter, err
	}
	if ok, eb, err := parser.tryReadEscaped(b); ok || err != nil {
		return csvTokenEscaped | csvToken(eb), err
	}
	return csvToken(b), nil
}

// appendCSVTokenToRecordBuffer writes a token's byte (re-inserting the escape
// character for escaped tokens) into recordBuffer.
func (parser *CSVParser) appendCSVTokenToRecordBuffer(token csvToken) {
	if token&csvTokenEscaped != 0 {
		parser.recordBuffer = append(parser.recordBuffer, parser.escapedBy[0])
	}
	parser.recordBuffer = append(parser.recordBuffer, byte(token))
}

// readUntil reads the buffer until any character from the `chars` set is found.
// that character is excluded from the final buffer.
func (parser *CSVParser) readUntil(chars *byteSet) ([]byte, byte, error) {
	index := IndexAnyByte(parser.buf, chars)
	if index >= 0 {
		ret := parser.buf[:index]
		parser.buf = parser.buf[index:]
		parser.pos += int64(index)
		return ret, parser.buf[0], nil
	}

	// not found in parser.buf, need allocate and loop.
	var buf []byte
	for {
		buf = append(buf, parser.buf...)
		if len(buf) > LargestEntryLimit {
			return buf, 0, moerr.NewInternalErrorNoCtx("size of row cannot exceed the max value of txn-entry-size-limit")
		}
		parser.buf = nil
		if err := parser.readBlock(); err != nil || len(parser.buf) == 0 {
			if err == nil {
				err = io.EOF
			}
			// pos was not advanced while accumulating into buf, so add the
			// whole accumulated length exactly once here.
			parser.pos += int64(len(buf))
			return buf, 0, err
		}
		index := IndexAnyByte(parser.buf, chars)
		if index >= 0 {
			buf = append(buf, parser.buf[:index]...)
			parser.buf = parser.buf[index:]
			parser.pos += int64(len(buf))
			return buf, parser.buf[0], nil
		}
	}
}

// readRecord reads one raw record (a slice of raw fields) into dst, reusing
// its capacity when possible. It implements the LOAD DATA semantics for
// STARTING BY, quoting, comments, empty/whitespace lines and CRLF stripping.
func (parser *CSVParser) readRecord(dst []field) ([]field, error) {
	parser.recordBuffer = parser.recordBuffer[:0]
	parser.fieldIndexes = parser.fieldIndexes[:0]
	parser.fieldIsQuoted = parser.fieldIsQuoted[:0]

	isEmptyLine := true
	whitespaceLine := true
	foundStartingByThisLine := false
	prevToken := csvTokenNewLine
	fieldIsQuoted := false
	var firstToken csvToken

outside:
	for {
		// we should drop
		// 1. the whole line if it does not contain startingBy
		// 2. any character before startingBy
		// since we have checked startingBy does not contain terminator, we can
		// split at terminator to check the substring contains startingBy. Even
		// if the terminator is inside a quoted field which means it's not the
		// end of a line, the substring can still be dropped by rule 2.
		if len(parser.startingBy) > 0 && !foundStartingByThisLine {
			oldPos := parser.pos
			content, _, err := parser.readUntilTerminator()
			if err != nil {
				if len(content) == 0 {
					return nil, err
				}
				// if we reached EOF, we should still check the content contains
				// startingBy and try to put back and parse it.
			}
			idx := bytes.Index(content, parser.startingBy)
			if idx == -1 {
				continue
			}
			foundStartingByThisLine = true
			content = content[idx+len(parser.startingBy):]
			// Put the remainder after startingBy back in front of the unread
			// buffer and rewind pos to just after the startingBy prefix.
			parser.buf = append(content, parser.buf...)
			parser.pos = oldPos + int64(idx+len(parser.startingBy))
		}

		content, firstByte, err := parser.readUntil(&parser.unquoteByteSet)

		if len(content) > 0 {
			isEmptyLine = false
			if prevToken == csvTokenDelimiter {
				return nil, errUnexpectedQuoteField
			}
			parser.recordBuffer = append(parser.recordBuffer, content...)
			prevToken = csvTokenAnyUnquoted
		}

		if err != nil {
			if isEmptyLine || err != io.EOF {
				return nil, err
			}
			// treat EOF as the same as trailing \n.
			firstToken = csvTokenNewLine
		} else {
			parser.skipBytes(1)
			firstToken, err = parser.readUnquoteToken(firstByte)
			if err != nil {
				return nil, err
			}
		}

		switch firstToken {
		case csvTokenComma:
			// Field boundary: record where this field ends in recordBuffer.
			whitespaceLine = false
			parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer))
			parser.fieldIsQuoted = append(parser.fieldIsQuoted, fieldIsQuoted)
			fieldIsQuoted = false
		case csvTokenDelimiter:
			if prevToken != csvTokenComma && prevToken != csvTokenNewLine {
				if parser.unescapedQuote {
					// Mid-field quote tolerated: keep it as literal text.
					whitespaceLine = false
					parser.recordBuffer = append(parser.recordBuffer, parser.quote...)
					continue
				}
				return nil, errUnexpectedQuoteField
			}
			if err = parser.readQuotedField(); err != nil {
				return nil, err
			}
			fieldIsQuoted = true
			whitespaceLine = false
		case csvTokenNewLine:
			foundStartingByThisLine = false
			// new line = end of field (ignore empty lines)
			prevToken = firstToken
			if !parser.allowEmptyLine {
				if isEmptyLine {
					continue
				}
				// skip lines only contain whitespaces
				if err == nil && whitespaceLine && len(bytes.TrimSpace(parser.recordBuffer)) == 0 {
					parser.recordBuffer = parser.recordBuffer[:0]
					continue
				}
			}
			// skip lines start with comment
			// NOTE(review): recordBuffer[0] is indexed without a length check;
			// with allowEmptyLine=true an empty line can reach here with an
			// empty recordBuffer — confirm this cannot panic when a comment
			// byte is configured.
			if err == nil && parser.comment != 0 && parser.recordBuffer[0] == parser.comment {
				// Reset all per-line state and restart the scan.
				parser.recordBuffer = parser.recordBuffer[:0]
				parser.fieldIndexes = parser.fieldIndexes[:0]
				parser.fieldIsQuoted = parser.fieldIsQuoted[:0]

				isEmptyLine = true
				whitespaceLine = true
				foundStartingByThisLine = false
				prevToken = csvTokenNewLine
				fieldIsQuoted = false
				continue
			}
			// Terminator was normalized from "\r\n" to "\n" in NewCSVParser,
			// so strip the '\r' left at the end of the record here.
			if bytes.Equal(parser.newLine, []byte{'\n'}) {
				if n := len(parser.recordBuffer); n > 1 && parser.recordBuffer[n-1] == '\r' {
					parser.recordBuffer = parser.recordBuffer[:n-1]
				}
			}
			parser.fieldIndexes = append(parser.fieldIndexes, len(parser.recordBuffer))
			parser.fieldIsQuoted = append(parser.fieldIsQuoted, fieldIsQuoted)
			// the loop is end, no need to reset fieldIsQuoted
			break outside
		default:
			if prevToken == csvTokenDelimiter {
				return nil, errUnexpectedQuoteField
			}
			parser.appendCSVTokenToRecordBuffer(firstToken)
		}
		prevToken = firstToken
		isEmptyLine = false
	}
	// Create a single string and create slices out of it.
	// This pins the memory of the fields together, but allocates once.
	str := string(parser.recordBuffer) // Convert to string once to batch allocations
	dst = dst[:0]
	if cap(dst) < len(parser.fieldIndexes) {
		dst = make([]field, len(parser.fieldIndexes))
	}
	dst = dst[:len(parser.fieldIndexes)]
	var preIdx int
	for i, idx := range parser.fieldIndexes {
		dst[i].content = str[preIdx:idx]
		dst[i].quoted = parser.fieldIsQuoted[i]
		preIdx = idx
	}

	// Check or update the expected fields per field.
	return dst, nil
}

// readQuotedField consumes the rest of a quoted field (the opening delimiter
// is already consumed), handling doubled delimiters and, when configured,
// unescaped quotes that are not followed by a separator/terminator.
func (parser *CSVParser) readQuotedField() error {
	for {
		prevPos := parser.pos
		content, terminator, err := parser.readUntil(&parser.quoteByteSet)
		if err != nil {
			if err == io.EOF {
				// return the position of quote to the caller.
				// because we return an error here, the parser won't
				// use the `pos` again, so it's safe to modify it here.
				parser.pos = prevPos - 1
				// set buf to parser.buf in order to print err log
				parser.buf = content
				err = parser.replaceEOF(err, errUnterminatedQuotedField)
			}
			return err
		}
		parser.recordBuffer = append(parser.recordBuffer, content...)
		parser.skipBytes(1)

		token, err := parser.readQuotedToken(terminator)
		if err != nil {
			return err
		}

		switch token {
		case csvTokenDelimiter:
			// encountered '"' -> continue if we're seeing '""'.
			doubledDelimiter, err := parser.tryReadExact(parser.quote)
			if err != nil {
				return err
			}
			if doubledDelimiter {
				// consume the double quotation mark and continue
				parser.recordBuffer = append(parser.recordBuffer, parser.quote...)
			} else if parser.unescapedQuote {
				// allow unescaped quote inside quoted field, so we only finish
				// reading the field when we see a delimiter + comma/newline.
				comma, _, err2 := parser.tryPeekExact(parser.comma)
				if comma || err2 != nil {
					return err2
				}
				newline, eof, err2 := parser.tryPeekExact(parser.newLine)
				if eof || newline {
					return nil
				}
				if err2 != nil {
					return err2
				}
				// Neither separator nor terminator follows: the quote was
				// literal text inside the field.
				parser.recordBuffer = append(parser.recordBuffer, parser.quote...)
			} else {
				// the field is completed, exit.
				return nil
			}
		default:
			parser.appendCSVTokenToRecordBuffer(token)
		}
	}
}

// replaceEOF substitutes io.EOF with `replaced`; any other error (or nil)
// passes through unchanged.
func (parser *CSVParser) replaceEOF(err error, replaced error) error {
	if err == nil || err != io.EOF {
		return err
	}
	return replaced
}

// readColumns reads the columns of this CSV file.
func (parser *CSVParser) readColumns() error {
	columns, err := parser.readRecord(nil)
	if err != nil {
		return err
	}
	if !parser.cfg.HeaderSchemaMatch {
		return nil
	}
	parser.columns = make([]string, 0, len(columns))
	for _, colName := range columns {
		colNameStr, _, err := parser.unescapeString(colName)
		if err != nil {
			return err
		}
		parser.columns = append(parser.columns, strings.ToLower(colNameStr))
	}
	return nil
}

// readUntilTerminator seeks the file until the terminator token is found, and
// returns
// - the content with terminator, or the content read before meet error
// - the file offset beyond the terminator, or the offset when meet error
// - error
// Note that the terminator string pattern may be the content of a field, which
// means it's inside quotes. Caller should make sure to handle this case.
func (parser *CSVParser) readUntilTerminator() ([]byte, int64, error) {
	var ret []byte
	for {
		content, firstByte, err := parser.readUntil(&parser.newLineByteSet)
		ret = append(ret, content...)
		if err != nil {
			return ret, parser.pos, err
		}
		parser.skipBytes(1)
		ret = append(ret, firstByte)
		if ok, err := parser.tryReadNewLine(firstByte); ok || err != nil {
			// tryReadNewLine consumed newLine[1:]; mirror it into ret.
			if len(parser.newLine) >= 1 {
				ret = append(ret, parser.newLine[1:]...)
			}
			return ret, parser.pos, err
		}
	}
}

// readBlock refills parser.buf from the reader, preserving any unparsed
// bytes. The first block has a UTF-8 BOM stripped.
func (parser *CSVParser) readBlock() error {
	n, err := io.ReadFull(parser.reader, parser.blockBuf)

	switch {
	case errors.Is(err, io.ErrUnexpectedEOF), err == io.EOF:
		parser.isLastChunk = true
		fallthrough
	case err == nil:
		// `parser.buf` reference to `appendBuf.Bytes`, so should use remainBuf to
		// hold the `parser.buf` rest data to prevent slice overlap
		parser.remainBuf.Reset()
		parser.remainBuf.Write(parser.buf)
		parser.appendBuf.Reset()
		parser.appendBuf.Write(parser.remainBuf.Bytes())
		blockData := parser.blockBuf[:n]
		if parser.pos == 0 {
			// Drop the BOM and account for the skipped bytes in pos.
			bomCleanedData := bom.Clean(blockData)
			parser.pos += int64(n - len(bomCleanedData))
			blockData = bomCleanedData
		}
		parser.appendBuf.Write(blockData)
		parser.buf = parser.appendBuf.Bytes()
		return nil
	default:
		return err
	}
}

// Columns returns the header column names parsed by readColumns.
func (parser *CSVParser) Columns() []string {
	return parser.columns
}

// SetColumns overrides the parser's column names.
func (parser *CSVParser) SetColumns(columns []string) {
	parser.columns = columns
}

// unescape collapses doubled delimiters and expands MySQL-style escape
// sequences (\0 \b \n \r \t \Z); any other escaped character is replaced by
// itself with the escape character removed.
func unescape(
	input string,
	delim string,
	escFlavor escapeFlavor,
	escChar byte,
	unescapeRegexp *regexp.Regexp,
) string {
	if len(delim) > 0 {
		delim2 := delim + delim
		if strings.Contains(input, delim2) {
			input = strings.ReplaceAll(input, delim2, delim)
		}
	}
	if escFlavor != escapeFlavorNone && strings.IndexByte(input, escChar) != -1 {
		// unescapeRegexp matches escChar followed by any single character
		// (including newline, thanks to the (?s) flag at compile time).
		input = unescapeRegexp.ReplaceAllStringFunc(input, func(substr string) string {
			switch substr[1] {
			case '0':
				return "\x00"
			case 'b':
				return "\b"
			case 'n':
				return "\n"
			case 'r':
				return "\r"
			case 't':
				return "\t"
			case 'Z':
				return "\x1a"
			default:
				return substr[1:]
			}
		})
	}
	return input
}

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package bytes implements functions for the manipulation of byte slices.
// It is analogous to the facilities of the strings package.

// this part is copy from `bytes/bytes.go`

// byteSet is a 32-byte value, where each bit represents the presence of a
// given byte value in the set.
type byteSet [8]uint32

// makeByteSet creates a set of byte value.
func makeByteSet(chars []byte) (as byteSet) {
	for i := 0; i < len(chars); i++ {
		c := chars[i]
		as[c>>5] |= 1 << uint(c&31)
	}
	return as
}

// contains reports whether c is inside the set.
func (as *byteSet) contains(c byte) bool {
	return (as[c>>5] & (1 << uint(c&31))) != 0
}

// IndexAnyByte returns the byte index of the first occurrence in s of any in the byte
// points in chars. It returns -1 if there is no code point in common.
func IndexAnyByte(s []byte, as *byteSet) int {
	for i, c := range s {
		if as.contains(c) {
			return i
		}
	}
	return -1
}