// Source: github.com/rakyll/go@v0.0.0-20170216000551-64c02460d703/src/encoding/csv/reader.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package csv reads and writes comma-separated values (CSV) files.
// There are many kinds of CSV files; this package supports the format
// described in RFC 4180.
//
// A CSV file contains zero or more records of one or more fields per record.
// Each record is separated by the newline character. The final record may
// optionally be followed by a newline character.
//
//	field1,field2,field3
//
// White space is considered part of a field.
//
// Carriage returns before newline characters are silently removed.
//
// Blank lines are ignored. A line with only whitespace characters (excluding
// the ending newline character) is not considered a blank line.
//
// Fields which start and stop with the quote character " are called
// quoted-fields. The beginning and ending quote are not part of the
// field.
//
// The source:
//
//	normal string,"quoted-field"
//
// results in the fields
//
//	{`normal string`, `quoted-field`}
//
// Within a quoted-field a quote character followed by a second quote
// character is considered a single quote.
//
//	"the ""word"" is true","a ""quoted-field"""
//
// results in
//
//	{`the "word" is true`, `a "quoted-field"`}
//
// Newlines and commas may be included in a quoted-field:
//
//	"Multi-line
//	field","comma is ,"
//
// results in
//
//	{`Multi-line
//	field`, `comma is ,`}
package csv

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"io"
	"unicode"
)

// A ParseError is returned for parsing errors.
// The first line is 1. The first column is 0.
type ParseError struct {
	Line   int   // Line where the error occurred
	Column int   // Column (rune index) where the error occurred
	Err    error // The actual error
}

// Error formats the error as "line L, column C: " followed by the text of
// the underlying error.
func (e *ParseError) Error() string {
	// %v rather than %s so that a ParseError with a nil Err prints "<nil>"
	// instead of a formatting-verb complaint; output for non-nil errors is
	// unchanged.
	return fmt.Sprintf("line %d, column %d: %v", e.Line, e.Column, e.Err)
}

// Unwrap returns the underlying error, so that callers can match it with
// errors.Is and errors.As, for example errors.Is(err, ErrFieldCount).
func (e *ParseError) Unwrap() error { return e.Err }

// These are the errors that can be returned in ParseError.Err.
var (
	// ErrTrailingComma is retained only for backwards compatibility.
	//
	// Deprecated: ErrTrailingComma is no longer used.
	ErrTrailingComma = errors.New("extra delimiter at end of line")
	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
	ErrQuote         = errors.New("extraneous \" in field")
	ErrFieldCount    = errors.New("wrong number of fields in line")
)

// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	Comma rune
	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	Comment rune
	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int
	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes bool
	// TrailingComma is ignored; it is here for backwards compatibility.
	//
	// Deprecated: TrailingComma is no longer used.
	TrailingComma bool
	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool

	// line is the number of the line currently being parsed. It is
	// incremented as each line is started, so the first line is 1.
	line int
	// column is the rune index, within the current line, of the rune most
	// recently read. It is reset to -1 at the start of every line.
	column int
	// r is the buffered source of input runes.
	r *bufio.Reader
	// lineBuffer holds the unescaped fields read by readField, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// Example: for the row `a,"b","c""d",e` lineBuffer will contain `abc"de` and
	// fieldIndexes will contain the indexes 0, 1, 2, 5.
	lineBuffer bytes.Buffer
	// Indexes of fields inside lineBuffer.
	// The i'th field starts at offset fieldIndexes[i] in lineBuffer.
	fieldIndexes []int
}

// NewReader returns a new Reader that reads from r.
// The returned Reader uses ',' as its field delimiter; every other option
// is left at its zero value.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		Comma: ',',
		r:     bufio.NewReader(r),
	}
}

// error creates a new ParseError based on err, stamped with the Reader's
// current line and column.
func (r *Reader) error(err error) error {
	return &ParseError{
		Line:   r.line,
		Column: r.column,
		Err:    err,
	}
}

// Read reads one record (a slice of fields) from r.
// If the record has an unexpected number of fields,
// Read returns the record along with a *ParseError whose Err is ErrFieldCount.
// Except for that case, Read always returns either a non-nil
// record or a non-nil error, but not both.
// If there is no data left to be read, Read returns nil, io.EOF.
150 func (r *Reader) Read() (record []string, err error) { 151 for { 152 record, err = r.parseRecord() 153 if record != nil { 154 break 155 } 156 if err != nil { 157 return nil, err 158 } 159 } 160 161 if r.FieldsPerRecord > 0 { 162 if len(record) != r.FieldsPerRecord { 163 r.column = 0 // report at start of record 164 return record, r.error(ErrFieldCount) 165 } 166 } else if r.FieldsPerRecord == 0 { 167 r.FieldsPerRecord = len(record) 168 } 169 return record, nil 170 } 171 172 // ReadAll reads all the remaining records from r. 173 // Each record is a slice of fields. 174 // A successful call returns err == nil, not err == io.EOF. Because ReadAll is 175 // defined to read until EOF, it does not treat end of file as an error to be 176 // reported. 177 func (r *Reader) ReadAll() (records [][]string, err error) { 178 for { 179 record, err := r.Read() 180 if err == io.EOF { 181 return records, nil 182 } 183 if err != nil { 184 return nil, err 185 } 186 records = append(records, record) 187 } 188 } 189 190 // readRune reads one rune from r, folding \r\n to \n and keeping track 191 // of how far into the line we have read. r.column will point to the start 192 // of this rune, not the end of this rune. 193 func (r *Reader) readRune() (rune, error) { 194 r1, _, err := r.r.ReadRune() 195 196 // Handle \r\n here. We make the simplifying assumption that 197 // anytime \r is followed by \n that it can be folded to \n. 198 // We will not detect files which contain both \r\n and bare \n. 199 if r1 == '\r' { 200 r1, _, err = r.r.ReadRune() 201 if err == nil { 202 if r1 != '\n' { 203 r.r.UnreadRune() 204 r1 = '\r' 205 } 206 } 207 } 208 r.column++ 209 return r1, err 210 } 211 212 // skip reads runes up to and including the rune delim or until error. 
213 func (r *Reader) skip(delim rune) error { 214 for { 215 r1, err := r.readRune() 216 if err != nil { 217 return err 218 } 219 if r1 == delim { 220 return nil 221 } 222 } 223 } 224 225 // parseRecord reads and parses a single csv record from r. 226 func (r *Reader) parseRecord() (fields []string, err error) { 227 // Each record starts on a new line. We increment our line 228 // number (lines start at 1, not 0) and set column to -1 229 // so as we increment in readRune it points to the character we read. 230 r.line++ 231 r.column = -1 232 233 // Peek at the first rune. If it is an error we are done. 234 // If we support comments and it is the comment character 235 // then skip to the end of line. 236 237 r1, _, err := r.r.ReadRune() 238 if err != nil { 239 return nil, err 240 } 241 242 if r.Comment != 0 && r1 == r.Comment { 243 return nil, r.skip('\n') 244 } 245 r.r.UnreadRune() 246 247 r.lineBuffer.Reset() 248 r.fieldIndexes = r.fieldIndexes[:0] 249 250 // At this point we have at least one field. 251 for { 252 idx := r.lineBuffer.Len() 253 254 haveField, delim, err := r.parseField() 255 if haveField { 256 r.fieldIndexes = append(r.fieldIndexes, idx) 257 } 258 259 if delim == '\n' || err == io.EOF { 260 if len(r.fieldIndexes) == 0 { 261 return nil, err 262 } 263 break 264 } 265 266 if err != nil { 267 return nil, err 268 } 269 } 270 271 fieldCount := len(r.fieldIndexes) 272 // Using this approach (creating a single string and taking slices of it) 273 // means that a single reference to any of the fields will retain the whole 274 // string. The risk of a nontrivial space leak caused by this is considered 275 // minimal and a tradeoff for better performance through the combined 276 // allocations. 
277 line := r.lineBuffer.String() 278 fields = make([]string, fieldCount) 279 280 for i, idx := range r.fieldIndexes { 281 if i == fieldCount-1 { 282 fields[i] = line[idx:] 283 } else { 284 fields[i] = line[idx:r.fieldIndexes[i+1]] 285 } 286 } 287 288 return fields, nil 289 } 290 291 // parseField parses the next field in the record. The read field is 292 // appended to r.lineBuffer. Delim is the first character not part of the field 293 // (r.Comma or '\n'). 294 func (r *Reader) parseField() (haveField bool, delim rune, err error) { 295 r1, err := r.readRune() 296 for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) { 297 r1, err = r.readRune() 298 } 299 300 if err == io.EOF && r.column != 0 { 301 return true, 0, err 302 } 303 if err != nil { 304 return false, 0, err 305 } 306 307 switch r1 { 308 case r.Comma: 309 // will check below 310 311 case '\n': 312 // We are a trailing empty field or a blank line 313 if r.column == 0 { 314 return false, r1, nil 315 } 316 return true, r1, nil 317 318 case '"': 319 // quoted field 320 Quoted: 321 for { 322 r1, err = r.readRune() 323 if err != nil { 324 if err == io.EOF { 325 if r.LazyQuotes { 326 return true, 0, err 327 } 328 return false, 0, r.error(ErrQuote) 329 } 330 return false, 0, err 331 } 332 switch r1 { 333 case '"': 334 r1, err = r.readRune() 335 if err != nil || r1 == r.Comma { 336 break Quoted 337 } 338 if r1 == '\n' { 339 return true, r1, nil 340 } 341 if r1 != '"' { 342 if !r.LazyQuotes { 343 r.column-- 344 return false, 0, r.error(ErrQuote) 345 } 346 // accept the bare quote 347 r.lineBuffer.WriteRune('"') 348 } 349 case '\n': 350 r.line++ 351 r.column = -1 352 } 353 r.lineBuffer.WriteRune(r1) 354 } 355 356 default: 357 // unquoted field 358 for { 359 r.lineBuffer.WriteRune(r1) 360 r1, err = r.readRune() 361 if err != nil || r1 == r.Comma { 362 break 363 } 364 if r1 == '\n' { 365 return true, r1, nil 366 } 367 if !r.LazyQuotes && r1 == '"' { 368 return false, 0, r.error(ErrBareQuote) 369 } 
370 } 371 } 372 373 if err != nil { 374 if err == io.EOF { 375 return true, 0, err 376 } 377 return false, 0, err 378 } 379 380 return true, r1, nil 381 }