github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/encoding/csv/reader.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package csv reads and writes comma-separated values (CSV) files.
// There are many kinds of CSV files; this package supports the format
// described in RFC 4180.
//
// A csv file contains zero or more records of one or more fields per record.
// Each record is separated by the newline character. The final record may
// optionally be followed by a newline character.
//
//	field1,field2,field3
//
// White space is considered part of a field.
//
// Carriage returns before newline characters are silently removed.
//
// Blank lines are ignored. A line with only whitespace characters (excluding
// the ending newline character) is not considered a blank line.
//
// Fields which start and stop with the quote character " are called
// quoted-fields. The beginning and ending quote are not part of the
// field.
//
// The source:
//
//	normal string,"quoted-field"
//
// results in the fields
//
//	{`normal string`, `quoted-field`}
//
// Within a quoted-field a quote character followed by a second quote
// character is considered a single quote.
//
//	"the ""word"" is true","a ""quoted-field"""
//
// results in
//
//	{`the "word" is true`, `a "quoted-field"`}
//
// Newlines and commas may be included in a quoted-field
//
//	"Multi-line
//	field","comma is ,"
//
// results in
//
//	{`Multi-line
//	field`, `comma is ,`}
package csv

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"io"
	"unicode"
	"unicode/utf8"
)

// A ParseError is returned for parsing errors.
// Line and column numbers are 1-indexed.
type ParseError struct {
	StartLine int   // Line where the record starts
	Line      int   // Line where the error occurred
	Column    int   // Column (1-based byte index) where the error occurred
	Err       error // The actual error
}

func (e *ParseError) Error() string {
	if e.Err == ErrFieldCount {
		return fmt.Sprintf("record on line %d: %v", e.Line, e.Err)
	}
	if e.StartLine != e.Line {
		return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.StartLine, e.Line, e.Column, e.Err)
	}
	return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err)
}

func (e *ParseError) Unwrap() error { return e.Err }

// These are the errors that can be returned in ParseError.Err.
var (
	ErrTrailingComma = errors.New("extra delimiter at end of line") // Deprecated: No longer used.
	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
	ErrQuote         = errors.New("extraneous or missing \" in quoted-field")
	ErrFieldCount    = errors.New("wrong number of fields")
)

var errInvalidDelim = errors.New("csv: invalid field or comment delimiter")

func validDelim(r rune) bool {
	return r != 0 && r != '"' && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError
}

// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
//
// The Reader converts all \r\n sequences in its input to plain \n,
// including in multiline field values, so that the returned data does
// not depend on which line-ending convention an input file uses.
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	// Comma must be a valid rune and must not be \r, \n,
	// or the Unicode replacement character (0xFFFD).
	Comma rune

	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	// Comment must be a valid rune and must not be \r, \n,
	// or the Unicode replacement character (0xFFFD).
	// It must also not be equal to Comma.
	Comment rune

	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int

	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes bool

	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool

	// ReuseRecord controls whether calls to Read may return a slice sharing
	// the backing array of the previous call's returned slice for performance.
	// By default, each call to Read returns newly allocated memory owned by the caller.
	ReuseRecord bool

	TrailingComma bool // Deprecated: No longer used.

	r *bufio.Reader

	// numLine is the current line being read in the CSV file.
	numLine int

	// offset is the input stream byte offset of the current reader position.
	offset int64

	// rawBuffer is a line buffer only used by the readLine method.
	rawBuffer []byte

	// recordBuffer holds the unescaped fields, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
	// and fieldIndexes will contain the indexes [1, 2, 5, 6].
	recordBuffer []byte

	// fieldIndexes is an index of fields inside recordBuffer.
	// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
	fieldIndexes []int

	// fieldPositions is an index of field positions for the
	// last record returned by Read.
	fieldPositions []position

	// lastRecord is a record cache and only used when ReuseRecord == true.
	lastRecord []string
}

// NewReader returns a new Reader that reads from r.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		Comma: ',',
		r:     bufio.NewReader(r),
	}
}

// Read reads one record (a slice of fields) from r.
// If the record has an unexpected number of fields,
// Read returns the record along with the error ErrFieldCount.
// Except for that case, Read always returns either a non-nil
// record or a non-nil error, but not both.
// If there is no data left to be read, Read returns nil, io.EOF.
// If ReuseRecord is true, the returned slice may be shared
// between multiple calls to Read.
func (r *Reader) Read() (record []string, err error) {
	if r.ReuseRecord {
		record, err = r.readRecord(r.lastRecord)
		r.lastRecord = record
	} else {
		record, err = r.readRecord(nil)
	}
	return record, err
}

// FieldPos returns the line and column corresponding to
// the start of the field with the given index in the slice most recently
// returned by Read. Numbering of lines and columns starts at 1;
// columns are counted in bytes, not runes.
//
// If this is called with an out-of-bounds index, it panics.
func (r *Reader) FieldPos(field int) (line, column int) {
	if field < 0 || field >= len(r.fieldPositions) {
		panic("out of range index passed to FieldPos")
	}
	p := &r.fieldPositions[field]
	return p.line, p.col
}

// InputOffset returns the input stream byte offset of the current reader
// position. The offset gives the location of the end of the most recently
// read row and the beginning of the next row.
func (r *Reader) InputOffset() int64 {
	return r.offset
}

// pos holds the position of a field in the current line.
type position struct {
	line, col int
}

// ReadAll reads all the remaining records from r.
// Each record is a slice of fields.
// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
// defined to read until EOF, it does not treat end of file as an error to be
// reported.
func (r *Reader) ReadAll() (records [][]string, err error) {
	for {
		record, err := r.readRecord(nil)
		if err == io.EOF {
			return records, nil
		}
		if err != nil {
			return nil, err
		}
		records = append(records, record)
	}
}

// readLine reads the next line (with the trailing endline).
// If EOF is hit without a trailing endline, it will be omitted.
// If some bytes were read, then the error is never io.EOF.
// The result is only valid until the next call to readLine.
func (r *Reader) readLine() ([]byte, error) {
	line, err := r.r.ReadSlice('\n')
	if err == bufio.ErrBufferFull {
		r.rawBuffer = append(r.rawBuffer[:0], line...)
		for err == bufio.ErrBufferFull {
			line, err = r.r.ReadSlice('\n')
			r.rawBuffer = append(r.rawBuffer, line...)
		}
		line = r.rawBuffer
	}
	readSize := len(line)
	if readSize > 0 && err == io.EOF {
		err = nil
		// For backwards compatibility, drop trailing \r before EOF.
		if line[readSize-1] == '\r' {
			line = line[:readSize-1]
		}
	}
	r.numLine++
	r.offset += int64(readSize)
	// Normalize \r\n to \n on all input lines.
	if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' {
		line[n-2] = '\n'
		line = line[:n-1]
	}
	return line, err
}

// lengthNL reports the number of bytes for the trailing \n.
func lengthNL(b []byte) int {
	if len(b) > 0 && b[len(b)-1] == '\n' {
		return 1
	}
	return 0
}

// nextRune returns the next rune in b or utf8.RuneError.
func nextRune(b []byte) rune {
	r, _ := utf8.DecodeRune(b)
	return r
}

func (r *Reader) readRecord(dst []string) ([]string, error) {
	if r.Comma == r.Comment || !validDelim(r.Comma) || (r.Comment != 0 && !validDelim(r.Comment)) {
		return nil, errInvalidDelim
	}

	// Read line (automatically skipping past empty lines and any comments).
	var line []byte
	var errRead error
	for errRead == nil {
		line, errRead = r.readLine()
		if r.Comment != 0 && nextRune(line) == r.Comment {
			line = nil
			continue // Skip comment lines
		}
		if errRead == nil && len(line) == lengthNL(line) {
			line = nil
			continue // Skip empty lines
		}
		break
	}
	if errRead == io.EOF {
		return nil, errRead
	}

	// Parse each field in the record.
	var err error
	const quoteLen = len(`"`)
	commaLen := utf8.RuneLen(r.Comma)
	recLine := r.numLine // Starting line for record
	r.recordBuffer = r.recordBuffer[:0]
	r.fieldIndexes = r.fieldIndexes[:0]
	r.fieldPositions = r.fieldPositions[:0]
	pos := position{line: r.numLine, col: 1}
parseField:
	for {
		if r.TrimLeadingSpace {
			i := bytes.IndexFunc(line, func(r rune) bool {
				return !unicode.IsSpace(r)
			})
			if i < 0 {
				i = len(line)
				pos.col -= lengthNL(line)
			}
			line = line[i:]
			pos.col += i
		}
		if len(line) == 0 || line[0] != '"' {
			// Non-quoted string field
			i := bytes.IndexRune(line, r.Comma)
			field := line
			if i >= 0 {
				field = field[:i]
			} else {
				field = field[:len(field)-lengthNL(field)]
			}
			// Check to make sure a quote does not appear in field.
			if !r.LazyQuotes {
				if j := bytes.IndexByte(field, '"'); j >= 0 {
					col := pos.col + j
					err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
					break parseField
				}
			}
			r.recordBuffer = append(r.recordBuffer, field...)
			r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
			r.fieldPositions = append(r.fieldPositions, pos)
			if i >= 0 {
				line = line[i+commaLen:]
				pos.col += i + commaLen
				continue parseField
			}
			break parseField
		} else {
			// Quoted string field
			fieldPos := pos
			line = line[quoteLen:]
			pos.col += quoteLen
			for {
				i := bytes.IndexByte(line, '"')
				if i >= 0 {
					// Hit next quote.
					r.recordBuffer = append(r.recordBuffer, line[:i]...)
					line = line[i+quoteLen:]
					pos.col += i + quoteLen
					switch rn := nextRune(line); {
					case rn == '"':
						// `""` sequence (append quote).
						r.recordBuffer = append(r.recordBuffer, '"')
						line = line[quoteLen:]
						pos.col += quoteLen
					case rn == r.Comma:
						// `",` sequence (end of field).
						line = line[commaLen:]
						pos.col += commaLen
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						r.fieldPositions = append(r.fieldPositions, fieldPos)
						continue parseField
					case lengthNL(line) == len(line):
						// `"\n` sequence (end of line).
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						r.fieldPositions = append(r.fieldPositions, fieldPos)
						break parseField
					case r.LazyQuotes:
						// `"` sequence (bare quote).
						r.recordBuffer = append(r.recordBuffer, '"')
					default:
						// `"*` sequence (invalid non-escaped quote).
						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: pos.col - quoteLen, Err: ErrQuote}
						break parseField
					}
				} else if len(line) > 0 {
					// Hit end of line (copy all data so far).
					r.recordBuffer = append(r.recordBuffer, line...)
					if errRead != nil {
						break parseField
					}
					pos.col += len(line)
					line, errRead = r.readLine()
					if len(line) > 0 {
						pos.line++
						pos.col = 1
					}
					if errRead == io.EOF {
						errRead = nil
					}
				} else {
					// Abrupt end of file (EOF or error).
					if !r.LazyQuotes && errRead == nil {
						err = &ParseError{StartLine: recLine, Line: pos.line, Column: pos.col, Err: ErrQuote}
						break parseField
					}
					r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
					r.fieldPositions = append(r.fieldPositions, fieldPos)
					break parseField
				}
			}
		}
	}
	if err == nil {
		err = errRead
	}

	// Create a single string and create slices out of it.
	// This pins the memory of the fields together, but allocates once.
	str := string(r.recordBuffer) // Convert to string once to batch allocations
	dst = dst[:0]
	if cap(dst) < len(r.fieldIndexes) {
		dst = make([]string, len(r.fieldIndexes))
	}
	dst = dst[:len(r.fieldIndexes)]
	var preIdx int
	for i, idx := range r.fieldIndexes {
		dst[i] = str[preIdx:idx]
		preIdx = idx
	}

	// Check or update the expected fields per record.
	if r.FieldsPerRecord > 0 {
		if len(dst) != r.FieldsPerRecord && err == nil {
			err = &ParseError{
				StartLine: recLine,
				Line:      recLine,
				Column:    1,
				Err:       ErrFieldCount,
			}
		}
	} else if r.FieldsPerRecord == 0 {
		r.FieldsPerRecord = len(dst)
	}
	return dst, err
}
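
The file above is the implementation itself. As a quick orientation, the following is a minimal, self-contained sketch of how the exported API documented above is typically driven from client code; it uses only identifiers defined in this file and in the standard library (NewReader, the Comment and TrimLeadingSpace fields, Read, FieldPos, ParseError), while the sample input and variable names are purely illustrative.

package main

import (
	"encoding/csv"
	"errors"
	"fmt"
	"io"
	"strings"
)

func main() {
	// Illustrative input: a comment line, a quoted multi-line field,
	// and a doubled quote inside a quoted field.
	const data = "# ignored by the reader\n" +
		"name,note\n" +
		"alice,\"line one\nline two\"\n" +
		"bob,\"says \"\"hi\"\"\"\n"

	r := csv.NewReader(strings.NewReader(data))
	r.Comment = '#'           // skip lines starting with '#'
	r.TrimLeadingSpace = true // drop leading white space in fields

	for {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		var pe *csv.ParseError
		if errors.As(err, &pe) {
			// ParseError carries 1-indexed line and column information.
			fmt.Printf("parse error at line %d, column %d: %v\n", pe.Line, pe.Column, pe.Err)
			return
		}
		if err != nil {
			fmt.Println("read error:", err)
			return
		}
		// FieldPos reports where each field of the last record started.
		for i, field := range record {
			line, col := r.FieldPos(i)
			fmt.Printf("field %q starts at line %d, column %d\n", field, line, col)
		}
	}
}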
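
A second small sketch, again with made-up input, shows the two less obvious knobs documented above: ReuseRecord, which lets Read recycle the previously returned slice (so callers must copy records they keep), FieldsPerRecord, which enforces a fixed field count, and InputOffset, which reports the byte offset at the end of each record.

package main

import (
	"encoding/csv"
	"fmt"
	"io"
	"strings"
)

func main() {
	const data = "a,b,c\nd,e,f\ng,h,i\n"

	r := csv.NewReader(strings.NewReader(data))
	r.ReuseRecord = true  // Read may reuse the previous record's backing slice
	r.FieldsPerRecord = 3 // every record must have exactly three fields

	var kept [][]string
	for {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Println("read error:", err)
			return
		}
		// Because ReuseRecord is set, record may be overwritten by the next
		// call to Read, so copy it before keeping a reference.
		kept = append(kept, append([]string(nil), record...))
		fmt.Printf("record %v ends at byte offset %d\n", record, r.InputOffset())
	}
	fmt.Println("kept:", kept)
}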