// Source: github.com/hlts2/go@v0.0.0-20170904000733-812b34efaed8/src/encoding/csv/reader.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package csv reads and writes comma-separated values (CSV) files.
// There are many kinds of CSV files; this package supports the format
// described in RFC 4180.
//
// A csv file contains zero or more records of one or more fields per record.
// Each record is separated by the newline character. The final record may
// optionally be followed by a newline character.
//
//	field1,field2,field3
//
// White space is considered part of a field.
//
// Carriage returns before newline characters are silently removed.
//
// Blank lines are ignored. A line with only whitespace characters (excluding
// the ending newline character) is not considered a blank line.
//
// Fields which start and stop with the quote character " are called
// quoted-fields. The beginning and ending quote are not part of the
// field.
//
// The source:
//
//	normal string,"quoted-field"
//
// results in the fields
//
//	{`normal string`, `quoted-field`}
//
// Within a quoted-field a quote character followed by a second quote
// character is considered a single quote.
//
//	"the ""word"" is true","a ""quoted-field"""
//
// results in
//
//	{`the "word" is true`, `a "quoted-field"`}
//
// Newlines and commas may be included in a quoted-field:
//
//	"Multi-line
//	field","comma is ,"
//
// results in
//
//	{`Multi-line
//	field`, `comma is ,`}
package csv

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"io"
	"unicode"
)

// A ParseError is returned for parsing errors.
// The first line is 1. The first column is 0.
type ParseError struct {
	Line   int   // Line where the error occurred
	Column int   // Column (rune index) where the error occurred
	Err    error // The actual error
}

// Error formats the error as "line L, column C: message".
func (e *ParseError) Error() string {
	return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
}

// Unwrap returns the underlying error, so that the sentinel errors below
// can be matched with errors.Is even though they arrive wrapped in a
// *ParseError.
func (e *ParseError) Unwrap() error { return e.Err }

// These are the errors that can be returned in ParseError.Err.
var (
	ErrTrailingComma = errors.New("extra delimiter at end of line") // Deprecated: no longer used.
	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
	ErrQuote         = errors.New("extraneous \" in field")
	ErrFieldCount    = errors.New("wrong number of fields in line")
)

// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	Comma rune
	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	Comment rune
	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int
	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes bool
	// TrailingComma is ignored; it is here for backwards compatibility.
	//
	// Deprecated: no longer used.
	TrailingComma bool
	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool
	// ReuseRecord controls whether calls to Read may return a slice sharing
	// the backing array of the previous call's returned slice for performance.
	// By default, each call to Read returns newly allocated memory owned by the caller.
	ReuseRecord bool

	line       int // line currently being read (the first line is 1)
	recordLine int // line where the current record started
	column     int // rune index within the current line; -1 before the first rune
	r          *bufio.Reader
	// lineBuffer holds the unescaped fields read by parseField, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
	// fieldIndexes will contain the indexes 0, 1, 2, 5.
	lineBuffer bytes.Buffer
	// Indexes of fields inside lineBuffer.
	// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
	fieldIndexes []int

	// only used when ReuseRecord == true
	lastRecord []string
}

// NewReader returns a new Reader that reads from r.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		Comma: ',',
		r:     bufio.NewReader(r),
	}
}

// error creates a new ParseError based on err, located at the line where
// the current record started and at the current column.
func (r *Reader) error(err error) error {
	return &ParseError{
		Line:   r.recordLine,
		Column: r.column,
		Err:    err,
	}
}

// Read reads one record (a slice of fields) from r.
// If the record has an unexpected number of fields,
// Read returns the record along with the error ErrFieldCount.
// Except for that case, Read always returns either a non-nil
// record or a non-nil error, but not both.
// If there is no data left to be read, Read returns nil, io.EOF.
// If ReuseRecord is true, the returned slice may be shared
// between multiple calls to Read.
func (r *Reader) Read() (record []string, err error) {
	if r.ReuseRecord {
		record, err = r.readRecord(r.lastRecord)
		// Keep the previous backing array when this call produced no
		// record (parse error or EOF); otherwise one failed read would
		// throw the reusable storage away.
		if record != nil {
			r.lastRecord = record
		}
	} else {
		record, err = r.readRecord(nil)
	}

	return record, err
}

// ReadAll reads all the remaining records from r.
// Each record is a slice of fields.
// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
// defined to read until EOF, it does not treat end of file as an error to be
// reported.
func (r *Reader) ReadAll() (records [][]string, err error) {
	for {
		// Always pass nil: every record is retained in records, so the
		// slices must not share storage.
		record, err := r.readRecord(nil)
		if err == io.EOF {
			return records, nil
		}
		if err != nil {
			return nil, err
		}
		records = append(records, record)
	}
}

// readRecord reads and parses a single csv record from r.
// Unlike parseRecord, readRecord handles FieldsPerRecord.
// If dst has enough capacity it will be used for the returned record.
func (r *Reader) readRecord(dst []string) (record []string, err error) {
	// parseRecord returns (nil, nil) for blank lines and comment lines;
	// keep going until there is a real record or an error.
	for {
		record, err = r.parseRecord(dst)
		if record != nil {
			break
		}
		if err != nil {
			return nil, err
		}
	}

	if r.FieldsPerRecord > 0 {
		if len(record) != r.FieldsPerRecord {
			r.column = 0 // report at start of record
			return record, r.error(ErrFieldCount)
		}
	} else if r.FieldsPerRecord == 0 {
		// The first record fixes the field count for all later records.
		r.FieldsPerRecord = len(record)
	}
	return record, nil
}

// readRune reads one rune from r, folding \r\n to \n and keeping track
// of how far into the line we have read. r.column will point to the start
// of this rune, not the end of this rune.
func (r *Reader) readRune() (rune, error) {
	r1, _, err := r.r.ReadRune()

	// Handle \r\n here. We make the simplifying assumption that
	// anytime \r is followed by \n that it can be folded to \n.
	// We will not detect files which contain both \r\n and bare \n.
	if r1 == '\r' {
		r1, _, err = r.r.ReadRune()
		if err == nil {
			if r1 != '\n' {
				// Not a line ending: push the look-ahead rune back and
				// report the bare \r. UnreadRune directly follows a
				// successful ReadRune here.
				r.r.UnreadRune()
				r1 = '\r'
			}
		}
		// If the look-ahead failed, the \r is dropped and the error
		// (typically io.EOF) is returned instead.
	}
	r.column++
	return r1, err
}

// readRawRune works the same way as readRune, but does not fold \r\n to \n.
func (r *Reader) readRawRune() (rune, error) {
	r1, _, err := r.r.ReadRune()
	r.column++
	return r1, err
}

// skip reads runes up to and including the rune delim or until error.
func (r *Reader) skip(delim rune) error {
	for {
		r1, err := r.readRune()
		if err != nil {
			return err
		}
		if r1 == delim {
			return nil
		}
	}
}

// parseRecord reads and parses a single csv record from r.
// If dst has enough capacity it will be used for the returned fields.
// It returns (nil, nil) when the line was a comment or a blank line.
func (r *Reader) parseRecord(dst []string) (fields []string, err error) {
	// Each record starts on a new line. We increment our line
	// number (lines start at 1, not 0) and set column to -1
	// so as we increment in readRune it points to the character we read.
	// We track the line where the record starts in recordLine for use in errors.
	r.line++
	r.recordLine = r.line
	r.column = -1

	// Peek at the first rune. If it is an error we are done.
	// If we support comments and it is the comment character
	// then skip to the end of line.

	r1, _, err := r.r.ReadRune()
	if err != nil {
		return nil, err
	}

	if r.Comment != 0 && r1 == r.Comment {
		return nil, r.skip('\n')
	}
	// Put the peeked rune back; this directly follows a successful ReadRune.
	r.r.UnreadRune()

	r.lineBuffer.Reset()
	r.fieldIndexes = r.fieldIndexes[:0]

	// At this point we have at least one field.
	for {
		idx := r.lineBuffer.Len()

		haveField, delim, err := r.parseField()
		if haveField {
			r.fieldIndexes = append(r.fieldIndexes, idx)
		}

		if delim == '\n' || err == io.EOF {
			if len(r.fieldIndexes) == 0 {
				// Blank line (err == nil) or end of input (err == io.EOF).
				return nil, err
			}
			break
		}

		if err != nil {
			return nil, err
		}
	}

	fieldCount := len(r.fieldIndexes)
	// Using this approach (creating a single string and taking slices of it)
	// means that a single reference to any of the fields will retain the whole
	// string. The risk of a nontrivial space leak caused by this is considered
	// minimal and a tradeoff for better performance through the combined
	// allocations.
	line := r.lineBuffer.String()

	if cap(dst) >= fieldCount {
		fields = dst[:fieldCount]
	} else {
		fields = make([]string, fieldCount)
	}

	for i, idx := range r.fieldIndexes {
		if i == fieldCount-1 {
			fields[i] = line[idx:]
		} else {
			fields[i] = line[idx:r.fieldIndexes[i+1]]
		}
	}

	return fields, nil
}

// parseField parses the next field in the record. The read field is
// appended to r.lineBuffer. Delim is the first character not part of the field
// (r.Comma or '\n'); it is 0 when the field was ended by an error or EOF.
func (r *Reader) parseField() (haveField bool, delim rune, err error) {
	r1, err := r.readRune()
	for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
		r1, err = r.readRune()
	}

	// EOF anywhere but at the very start of a line means the input ended
	// right after a delimiter: that is a trailing empty field.
	if err == io.EOF && r.column != 0 {
		return true, 0, err
	}
	if err != nil {
		return false, 0, err
	}

	switch r1 {
	case r.Comma:
		// will check below

	case '\n':
		// We are a trailing empty field or a blank line
		if r.column == 0 {
			return false, r1, nil
		}
		return true, r1, nil

	case '"':
		// quoted field
	Quoted:
		for {
			// use readRawRune instead of readRune to preserve \r\n
			// in quotes fields.
			r1, err = r.readRawRune()
			if err != nil {
				if err == io.EOF {
					if r.LazyQuotes {
						return true, 0, err
					}
					return false, 0, r.error(ErrQuote)
				}
				return false, 0, err
			}
			switch r1 {
			case '"':
				// Look at what follows the quote to decide whether it
				// closes the field or is the first half of an escaped "".
				r1, err = r.readRune()
				if err != nil || r1 == r.Comma {
					break Quoted
				}
				if r1 == '\n' {
					return true, r1, nil
				}
				if r1 != '"' {
					if !r.LazyQuotes {
						r.column-- // report the quote, not the rune after it
						return false, 0, r.error(ErrQuote)
					}
					// accept the bare quote
					r.lineBuffer.WriteRune('"')
				}
			case '\n':
				// A newline inside quotes is data, but still starts a new
				// input line for position tracking.
				r.line++
				r.column = -1
			}
			r.lineBuffer.WriteRune(r1)
		}

	default:
		// unquoted field
		for {
			r.lineBuffer.WriteRune(r1)
			r1, err = r.readRune()
			if err != nil || r1 == r.Comma {
				break
			}
			if r1 == '\n' {
				return true, r1, nil
			}
			if !r.LazyQuotes && r1 == '"' {
				return false, 0, r.error(ErrBareQuote)
			}
		}
	}

	if err != nil {
		if err == io.EOF {
			return true, 0, err
		}
		return false, 0, err
	}

	return true, r1, nil
}