// github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/encoding/csv/reader.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package csv reads and writes comma-separated values (CSV) files.
// There are many kinds of CSV files; this package supports the format
// described in RFC 4180.
//
// A csv file contains zero or more records of one or more fields per record.
// Each record is separated by the newline character. The final record may
// optionally be followed by a newline character.
//
//	field1,field2,field3
//
// White space is considered part of a field.
//
// Carriage returns before newline characters are silently removed.
//
// Blank lines are ignored. A line with only whitespace characters (excluding
// the ending newline character) is not considered a blank line.
//
// Fields which start and stop with the quote character " are called
// quoted-fields. The beginning and ending quote are not part of the
// field.
//
// The source:
//
//	normal string,"quoted-field"
//
// results in the fields
//
//	{`normal string`, `quoted-field`}
//
// Within a quoted-field a quote character followed by a second quote
// character is considered a single quote.
//
//	"the ""word"" is true","a ""quoted-field"""
//
// results in
//
//	{`the "word" is true`, `a "quoted-field"`}
//
// Newlines and commas may be included in a quoted-field:
//
//	"Multi-line
//	field","comma is ,"
//
// results in
//
//	{`Multi-line
//	field`, `comma is ,`}
package csv

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"io"
	"unicode"
)

// A ParseError is returned for parsing errors.
// The first line is 1. The first column is 0.
type ParseError struct {
	Line   int   // Line where the error occurred
	Column int   // Column (rune index) where the error occurred
	Err    error // The actual error
}

// Error formats the error as "line L, column C: " followed by the
// underlying error's message.
func (e *ParseError) Error() string {
	return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
}

// Unwrap returns the underlying error, so that errors.Is and errors.As can
// match the sentinel errors below through a *ParseError.
func (e *ParseError) Unwrap() error { return e.Err }

// These are the errors that can be returned in ParseError.Err.
var (
	ErrTrailingComma = errors.New("extra delimiter at end of line") // no longer used
	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
	ErrQuote         = errors.New("extraneous \" in field")
	ErrFieldCount    = errors.New("wrong number of fields in line")
)

// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	Comma rune
	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	Comment rune
	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int
	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes    bool
	TrailingComma bool // ignored; here for backwards compatibility
	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool
	// ReuseRecord controls whether calls to Read may return a slice sharing
	// the backing array of the previous call's returned slice for performance.
	// By default, each call to Read returns newly allocated memory owned by the caller.
	ReuseRecord bool

	line   int
	column int
	r      *bufio.Reader
	// lineBuffer holds the unescaped fields read by parseField, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
	// fieldIndexes will contain the indexes 0, 1, 2, 5.
	lineBuffer bytes.Buffer
	// Indexes of fields inside lineBuffer.
	// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
	fieldIndexes []int

	// only used when ReuseRecord == true
	lastRecord []string
}

// NewReader returns a new Reader that reads from r.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		Comma: ',',
		r:     bufio.NewReader(r),
	}
}

// error creates a new ParseError based on err, stamped with the reader's
// current line and column.
func (r *Reader) error(err error) error {
	return &ParseError{
		Line:   r.line,
		Column: r.column,
		Err:    err,
	}
}

// Read reads one record (a slice of fields) from r.
// If the record has an unexpected number of fields,
// Read returns the record along with the error ErrFieldCount.
// Except for that case, Read always returns either a non-nil
// record or a non-nil error, but not both.
// If there is no data left to be read, Read returns nil, io.EOF.
// If ReuseRecord is true, the returned slice may be shared
// between multiple calls to Read.
func (r *Reader) Read() (record []string, err error) {
	if r.ReuseRecord {
		// Offer the previously returned slice as the destination and
		// remember whatever comes back for the next call.
		record, err = r.readRecord(r.lastRecord)
		r.lastRecord = record
	} else {
		record, err = r.readRecord(nil)
	}

	return record, err
}

// ReadAll reads all the remaining records from r.
// Each record is a slice of fields.
// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
// defined to read until EOF, it does not treat end of file as an error to be
// reported.
// On any other error ReadAll returns nil records together with that error.
func (r *Reader) ReadAll() (records [][]string, err error) {
	for {
		var record []string
		record, err = r.readRecord(nil)
		if err == io.EOF {
			return records, nil
		}
		if err != nil {
			return nil, err
		}
		records = append(records, record)
	}
}

// readRecord reads and parses a single csv record from r.
// Unlike parseRecord, readRecord handles FieldsPerRecord.
// If dst has enough capacity it will be used for the returned record.
func (r *Reader) readRecord(dst []string) (record []string, err error) {
	// parseRecord returns a nil record with a nil error for lines that
	// produce no fields (blank lines, comments); keep going until we get
	// either a record or an error.
	for {
		record, err = r.parseRecord(dst)
		if record != nil {
			break
		}
		if err != nil {
			return nil, err
		}
	}

	if r.FieldsPerRecord > 0 {
		if len(record) != r.FieldsPerRecord {
			r.column = 0 // report at start of record
			return record, r.error(ErrFieldCount)
		}
	} else if r.FieldsPerRecord == 0 {
		// First record fixes the expected field count for the rest.
		r.FieldsPerRecord = len(record)
	}
	return record, nil
}

// readRune reads one rune from r, folding \r\n to \n and keeping track
// of how far into the line we have read. r.column will point to the start
// of this rune, not the end of this rune. r.column is advanced even when
// an error is returned.
func (r *Reader) readRune() (rune, error) {
	r1, _, err := r.r.ReadRune()

	// Handle \r\n here. We make the simplifying assumption that
	// anytime \r is followed by \n that it can be folded to \n.
	// We will not detect files which contain both \r\n and bare \n.
	//
	// If the read following a \r fails (for example at EOF), the \r is
	// not returned: the caller sees the result of the failed read.
	if r1 == '\r' {
		r1, _, err = r.r.ReadRune()
		if err == nil {
			if r1 != '\n' {
				// Not a \r\n pair: push the rune back and report the \r.
				r.r.UnreadRune()
				r1 = '\r'
			}
		}
	}
	r.column++
	return r1, err
}

// skip reads runes up to and including the rune delim or until error.
func (r *Reader) skip(delim rune) error {
	for {
		r1, err := r.readRune()
		if err != nil {
			return err
		}
		if r1 == delim {
			return nil
		}
	}
}

// parseRecord reads and parses a single csv record from r.
// If dst has enough capacity it will be used for the returned fields.
// It returns nil fields and a nil error when the line yields no record
// (a blank line or a comment line).
func (r *Reader) parseRecord(dst []string) (fields []string, err error) {
	// Each record starts on a new line. We increment our line
	// number (lines start at 1, not 0) and set column to -1
	// so as we increment in readRune it points to the character we read.
	r.line++
	r.column = -1

	// Peek at the first rune. If it is an error we are done.
	// If we support comments and it is the comment character
	// then skip to the end of line.

	r1, _, err := r.r.ReadRune()
	if err != nil {
		return nil, err
	}

	if r.Comment != 0 && r1 == r.Comment {
		return nil, r.skip('\n')
	}
	r.r.UnreadRune()

	r.lineBuffer.Reset()
	r.fieldIndexes = r.fieldIndexes[:0]

	// At this point we have at least one field.
	for {
		idx := r.lineBuffer.Len()

		haveField, delim, ferr := r.parseField()
		if haveField {
			r.fieldIndexes = append(r.fieldIndexes, idx)
		}

		if delim == '\n' || ferr == io.EOF {
			if len(r.fieldIndexes) == 0 {
				return nil, ferr
			}
			break
		}

		if ferr != nil {
			return nil, ferr
		}
	}

	fieldCount := len(r.fieldIndexes)
	// Using this approach (creating a single string and taking slices of it)
	// means that a single reference to any of the fields will retain the whole
	// string. The risk of a nontrivial space leak caused by this is considered
	// minimal and a tradeoff for better performance through the combined
	// allocations.
	line := r.lineBuffer.String()

	if cap(dst) >= fieldCount {
		fields = dst[:fieldCount]
	} else {
		fields = make([]string, fieldCount)
	}

	for i, idx := range r.fieldIndexes {
		if i == fieldCount-1 {
			fields[i] = line[idx:]
		} else {
			fields[i] = line[idx:r.fieldIndexes[i+1]]
		}
	}

	return fields, nil
}

// parseField parses the next field in the record. The read field is
// appended to r.lineBuffer. Delim is the first character not part of the field
// (r.Comma or '\n'); it is 0 when the field ended because of an error,
// including io.EOF.
func (r *Reader) parseField() (haveField bool, delim rune, err error) {
	r1, err := r.readRune()
	for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
		r1, err = r.readRune()
	}

	// EOF anywhere but at the very start of a line still terminates a
	// (possibly empty) field.
	if err == io.EOF && r.column != 0 {
		return true, 0, err
	}
	if err != nil {
		return false, 0, err
	}

	switch r1 {
	case r.Comma:
		// Empty field; handled by the common return at the bottom.

	case '\n':
		// We are a trailing empty field or a blank line.
		if r.column == 0 {
			return false, r1, nil
		}
		return true, r1, nil

	case '"':
		// Quoted field.
	Quoted:
		for {
			r1, err = r.readRune()
			if err != nil {
				if err == io.EOF {
					if r.LazyQuotes {
						return true, 0, err
					}
					return false, 0, r.error(ErrQuote)
				}
				return false, 0, err
			}
			switch r1 {
			case '"':
				// Either the closing quote or the first half of "".
				r1, err = r.readRune()
				if err != nil || r1 == r.Comma {
					break Quoted
				}
				if r1 == '\n' {
					return true, r1, nil
				}
				if r1 != '"' {
					if !r.LazyQuotes {
						r.column-- // report the error at the quote itself
						return false, 0, r.error(ErrQuote)
					}
					// Accept the bare quote.
					r.lineBuffer.WriteRune('"')
				}
			case '\n':
				// Newline inside a quoted field: same record, next line.
				r.line++
				r.column = -1
			}
			r.lineBuffer.WriteRune(r1)
		}

	default:
		// Unquoted field.
		for {
			r.lineBuffer.WriteRune(r1)
			r1, err = r.readRune()
			if err != nil || r1 == r.Comma {
				break
			}
			if r1 == '\n' {
				return true, r1, nil
			}
			if !r.LazyQuotes && r1 == '"' {
				return false, 0, r.error(ErrBareQuote)
			}
		}
	}

	if err != nil {
		if err == io.EOF {
			return true, 0, err
		}
		return false, 0, err
	}

	return true, r1, nil
}