// Source: github.com/euank/go@v0.0.0-20160829210321-495514729181/src/encoding/csv/reader.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package csv reads and writes comma-separated values (CSV) files.
// There are many kinds of CSV files; this package supports the format
// described in RFC 4180.
//
// A csv file contains zero or more records of one or more fields per record.
// Each record is separated by the newline character. The final record may
// optionally be followed by a newline character.
//
//	field1,field2,field3
//
// White space is considered part of a field.
//
// Carriage returns before newline characters are silently removed.
//
// Blank lines are ignored. A line with only whitespace characters (excluding
// the ending newline character) is not considered a blank line.
//
// Fields which start and stop with the quote character " are called
// quoted-fields. The beginning and ending quote are not part of the
// field.
//
// The source:
//
//	normal string,"quoted-field"
//
// results in the fields
//
//	{`normal string`, `quoted-field`}
//
// Within a quoted-field a quote character followed by a second quote
// character is considered a single quote.
//
//	"the ""word"" is true","a ""quoted-field"""
//
// results in
//
//	{`the "word" is true`, `a "quoted-field"`}
//
// Newlines and commas may be included in a quoted-field
//
//	"Multi-line
//	field","comma is ,"
//
// results in
//
//	{`Multi-line
//	field`, `comma is ,`}
package csv

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"io"
	"unicode"
)

// A ParseError is returned for parsing errors.
// The first line is 1. The first column is 0.
type ParseError struct {
	Line   int   // Line where the error occurred
	Column int   // Column (rune index) where the error occurred
	Err    error // The actual error
}

// Error formats the error as "line L, column C: " followed by the
// underlying error.
func (e *ParseError) Error() string {
	return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
}

// Unwrap returns the underlying error, so that errors.Is and errors.As can
// match the sentinel errors below when they are wrapped in a ParseError.
func (e *ParseError) Unwrap() error { return e.Err }

// These are the errors that can be returned in ParseError.Err.
var (
	ErrTrailingComma = errors.New("extra delimiter at end of line") // no longer used
	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
	ErrQuote         = errors.New("extraneous \" in field")
	ErrFieldCount    = errors.New("wrong number of fields in line")
)

// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	Comma rune
	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	Comment rune
	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int
	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes    bool
	TrailingComma bool // ignored; here for backwards compatibility
	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool

	line   int           // current line number; incremented at the start of each record
	column int           // rune index within the current line of the last rune read
	r      *bufio.Reader // buffered input
	field  bytes.Buffer  // scratch buffer holding the field being parsed
}

// NewReader returns a new Reader that reads from r.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		Comma: ',',
		r:     bufio.NewReader(r),
	}
}

// error creates a new ParseError based on err, stamped with the
// Reader's current line and column.
func (r *Reader) error(err error) error {
	return &ParseError{
		Line:   r.line,
		Column: r.column,
		Err:    err,
	}
}

// Read reads one record from r. The record is a slice of strings with each
// string representing one field.
//
// If the record does not have the number of fields required by
// FieldsPerRecord, Read returns the record together with a *ParseError
// wrapping ErrFieldCount. When the input is exhausted Read returns
// nil, io.EOF.
func (r *Reader) Read() (record []string, err error) {
	// parseRecord yields a nil record with a nil error for lines that
	// carry no fields (blank lines, comments); keep going until we get a
	// real record or an error.
	for {
		record, err = r.parseRecord()
		if record != nil {
			// A final record without a trailing newline arrives together
			// with io.EOF; the record is returned now and io.EOF is
			// reported by the next call.
			break
		}
		if err != nil {
			return nil, err
		}
	}

	if r.FieldsPerRecord > 0 {
		if len(record) != r.FieldsPerRecord {
			r.column = 0 // report at start of record
			return record, r.error(ErrFieldCount)
		}
	} else if r.FieldsPerRecord == 0 {
		// First record seen: it fixes the expected field count.
		r.FieldsPerRecord = len(record)
	}
	return record, nil
}

// ReadAll reads all the remaining records from r.
// Each record is a slice of fields.
// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
// defined to read until EOF, it does not treat end of file as an error to be
// reported. On any other error ReadAll returns nil records and that error.
func (r *Reader) ReadAll() (records [][]string, err error) {
	for {
		var record []string
		record, err = r.Read()
		if err == io.EOF {
			return records, nil
		}
		if err != nil {
			return nil, err
		}
		records = append(records, record)
	}
}

// readRune reads one rune from r, folding \r\n to \n and keeping track
// of how far into the line we have read. r.column will point to the start
// of this rune, not the end of this rune.
func (r *Reader) readRune() (rune, error) {
	r1, _, err := r.r.ReadRune()

	// Handle \r\n here. We make the simplifying assumption that
	// anytime \r is followed by \n that it can be folded to \n.
	// We will not detect files which contain both \r\n and bare \n.
	if r1 == '\r' {
		// If this second read fails, its result and error are returned
		// as-is and the \r itself is not delivered to the caller.
		r1, _, err = r.r.ReadRune()
		if err == nil {
			if r1 != '\n' {
				// Lone \r: push the following rune back and report the \r.
				r.r.UnreadRune()
				r1 = '\r'
			}
		}
	}
	// The column advances even when an error is returned; parseField
	// relies on this to tell "EOF at start of line" from "EOF after a
	// delimiter".
	r.column++
	return r1, err
}

// skip reads runes up to and including the rune delim or until error.
func (r *Reader) skip(delim rune) error {
	for {
		r1, err := r.readRune()
		if err != nil {
			return err
		}
		if r1 == delim {
			return nil
		}
	}
}

// parseRecord reads and parses a single csv record from r.
//
// It returns a nil slice with a nil error when the line produced no
// fields (a blank line or a comment line). A record terminated by end of
// input is returned together with io.EOF.
func (r *Reader) parseRecord() (fields []string, err error) {
	// Each record starts on a new line. We increment our line
	// number (lines start at 1, not 0) and set column to -1
	// so as we increment in readRune it points to the character we read.
	r.line++
	r.column = -1

	// Peek at the first rune. If it is an error we are done.
	// If we support comments and it is the comment character
	// then skip to the end of line.

	r1, _, err := r.r.ReadRune()
	if err != nil {
		return nil, err
	}

	if r.Comment != 0 && r1 == r.Comment {
		return nil, r.skip('\n')
	}
	r.r.UnreadRune()

	// At this point we have at least one field.
	for {
		haveField, delim, err := r.parseField()
		if haveField {
			// If FieldsPerRecord is greater than 0 we can assume the final
			// length of fields to be equal to FieldsPerRecord.
			if r.FieldsPerRecord > 0 && fields == nil {
				fields = make([]string, 0, r.FieldsPerRecord)
			}
			fields = append(fields, r.field.String())
		}
		if delim == '\n' || err == io.EOF {
			return fields, err
		} else if err != nil {
			return nil, err
		}
	}
}

// parseField parses the next field in the record. The read field is
// located in r.field. Delim is the first character not part of the field
// (r.Comma or '\n').
//
// haveField reports whether a field (possibly empty) was read. At end of
// input delim is 0 and err is io.EOF.
func (r *Reader) parseField() (haveField bool, delim rune, err error) {
	r.field.Reset()

	r1, err := r.readRune()
	for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
		r1, err = r.readRune()
	}

	// EOF anywhere but at the very start of the line means the previous
	// rune was a delimiter (or trimmed space), so there is a final empty
	// field.
	if err == io.EOF && r.column != 0 {
		return true, 0, err
	}
	if err != nil {
		return false, 0, err
	}

	switch r1 {
	case r.Comma:
		// will check below

	case '\n':
		// We are a trailing empty field or a blank line
		if r.column == 0 {
			return false, r1, nil
		}
		return true, r1, nil

	case '"':
		// quoted field
	Quoted:
		for {
			r1, err = r.readRune()
			if err != nil {
				if err == io.EOF {
					// Unterminated quoted field.
					if r.LazyQuotes {
						return true, 0, err
					}
					return false, 0, r.error(ErrQuote)
				}
				return false, 0, err
			}
			switch r1 {
			case '"':
				// Look at what follows the quote to decide whether it
				// closes the field or is an escaped/bare quote.
				r1, err = r.readRune()
				if err != nil || r1 == r.Comma {
					break Quoted
				}
				if r1 == '\n' {
					return true, r1, nil
				}
				if r1 != '"' {
					if !r.LazyQuotes {
						// Point the error at the quote, not the rune after it.
						r.column--
						return false, 0, r.error(ErrQuote)
					}
					// accept the bare quote
					r.field.WriteRune('"')
				}
			case '\n':
				// Embedded newline: keep line/column tracking in step.
				r.line++
				r.column = -1
			}
			r.field.WriteRune(r1)
		}

	default:
		// unquoted field
		for {
			r.field.WriteRune(r1)
			r1, err = r.readRune()
			if err != nil || r1 == r.Comma {
				break
			}
			if r1 == '\n' {
				return true, r1, nil
			}
			if !r.LazyQuotes && r1 == '"' {
				return false, 0, r.error(ErrBareQuote)
			}
		}
	}

	if err != nil {
		if err == io.EOF {
			return true, 0, err
		}
		return false, 0, err
	}

	return true, r1, nil
}