github.com/mh-cbon/go@v0.0.0-20160603070303-9e112a3fe4c0/src/encoding/csv/reader.go (about) 1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package csv reads and writes comma-separated values (CSV) files. 6 // There are many kinds of CSV files; this package supports the format 7 // described in RFC 4180. 8 // 9 // A csv file contains zero or more records of one or more fields per record. 10 // Each record is separated by the newline character. The final record may 11 // optionally be followed by a newline character. 12 // 13 // field1,field2,field3 14 // 15 // White space is considered part of a field. 16 // 17 // Carriage returns before newline characters are silently removed. 18 // 19 // Blank lines are ignored. A line with only whitespace characters (excluding 20 // the ending newline character) is not considered a blank line. 21 // 22 // Fields which start and stop with the quote character " are called 23 // quoted-fields. The beginning and ending quote are not part of the 24 // field. 25 // 26 // The source: 27 // 28 // normal string,"quoted-field" 29 // 30 // results in the fields 31 // 32 // {`normal string`, `quoted-field`} 33 // 34 // Within a quoted-field a quote character followed by a second quote 35 // character is considered a single quote. 36 // 37 // "the ""word"" is true","a ""quoted-field""" 38 // 39 // results in 40 // 41 // {`the "word" is true`, `a "quoted-field"`} 42 // 43 // Newlines and commas may be included in a quoted-field 44 // 45 // "Multi-line 46 // field","comma is ," 47 // 48 // results in 49 // 50 // {`Multi-line 51 // field`, `comma is ,`} 52 package csv 53 54 import ( 55 "bufio" 56 "bytes" 57 "errors" 58 "fmt" 59 "io" 60 "unicode" 61 ) 62 63 // A ParseError is returned for parsing errors. 64 // The first line is 1. The first column is 0. 
// A ParseError is returned for parsing errors.
// The first line is 1. The first column is 0.
type ParseError struct {
	Line   int   // Line where the error occurred
	Column int   // Column (rune index) where the error occurred
	Err    error // The actual error
}

// Error reports the position of the failure followed by the underlying cause.
func (e *ParseError) Error() string {
	return fmt.Sprintf("line %d, column %d: %v", e.Line, e.Column, e.Err)
}

// Unwrap returns the underlying cause, so that errors.Is and errors.As can
// match the sentinel errors below through a *ParseError.
func (e *ParseError) Unwrap() error { return e.Err }

// These are the errors that can be returned in ParseError.Err.
var (
	// Deprecated: ErrTrailingComma is no longer used.
	ErrTrailingComma = errors.New("extra delimiter at end of line")
	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
	ErrQuote         = errors.New("extraneous \" in field")
	ErrFieldCount    = errors.New("wrong number of fields in line")
)

// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
//
// Comma is the field delimiter. It defaults to ','.
//
// Comment, if not 0, is the comment character. Lines beginning with the
// Comment character are ignored.
//
// If FieldsPerRecord is positive, Read requires each record to
// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
// the number of fields in the first record, so that future records must
// have the same field count. If FieldsPerRecord is negative, no check is
// made and records may have a variable number of fields.
//
// If LazyQuotes is true, a quote may appear in an unquoted field and a
// non-doubled quote may appear in a quoted field.
//
// If TrimLeadingSpace is true, leading white space in a field is ignored.
// If the field delimiter is white space, TrimLeadingSpace will trim the
// delimiter.
type Reader struct {
	Comma           rune // field delimiter (set to ',' by NewReader)
	Comment         rune // comment character for start of line
	FieldsPerRecord int  // number of expected fields per record
	LazyQuotes      bool // allow lazy quotes
	// Deprecated: TrailingComma is ignored; it is kept for backwards
	// compatibility only.
	TrailingComma    bool
	TrimLeadingSpace bool // trim leading space

	line   int           // line of the record being parsed; the first line is 1
	column int           // rune index within the line of the rune last read
	r      *bufio.Reader // buffered view of the input
	field  bytes.Buffer  // scratch buffer holding the field being parsed
}

// NewReader returns a new Reader that reads from r.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		Comma: ',',
		r:     bufio.NewReader(r),
	}
}

// error creates a new ParseError based on err, stamped with the reader's
// current line and column.
func (r *Reader) error(err error) error {
	return &ParseError{
		Line:   r.line,
		Column: r.column,
		Err:    err,
	}
}

// Read reads one record from r. The record is a slice of strings with each
// string representing one field.
//
// Blank lines and comment lines are skipped. When the input is exhausted Read
// returns a nil record and io.EOF. When the field count does not satisfy
// FieldsPerRecord, Read returns the record together with a *ParseError whose
// Err is ErrFieldCount.
func (r *Reader) Read() (record []string, err error) {
	// parseRecord yields a nil record with a nil error for lines that carry
	// no data (blank or comment lines), so keep going until we get either a
	// record or an error.
	for {
		record, err = r.parseRecord()
		if record != nil {
			// A final record without a trailing newline arrives together
			// with io.EOF; that EOF is reported by the next call instead.
			break
		}
		if err != nil {
			return nil, err
		}
	}

	switch {
	case r.FieldsPerRecord == 0:
		// The first record fixes the expected width for the rest.
		r.FieldsPerRecord = len(record)
	case r.FieldsPerRecord > 0 && len(record) != r.FieldsPerRecord:
		r.column = 0 // report at start of record
		return record, r.error(ErrFieldCount)
	}
	return record, nil
}

// ReadAll reads all the remaining records from r.
// Each record is a slice of fields.
// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
// defined to read until EOF, it does not treat end of file as an error to be
// reported. On any other error the records read so far are discarded and nil
// is returned with the error.
func (r *Reader) ReadAll() (records [][]string, err error) {
	for {
		var record []string
		record, err = r.Read()
		if err == io.EOF {
			return records, nil
		}
		if err != nil {
			return nil, err
		}
		records = append(records, record)
	}
}

// readRune reads one rune from r, folding \r\n to \n and keeping track
// of how far into the line we have read. r.column will point to the start
// of this rune, not the end of this rune.
func (r *Reader) readRune() (rune, error) {
	r1, _, err := r.r.ReadRune()

	// Handle \r\n here. We make the simplifying assumption that
	// anytime \r is followed by \n that it can be folded to \n.
	// We will not detect files which contain both \r\n and bare \n.
	if r1 == '\r' {
		r1, _, err = r.r.ReadRune()
		if err == nil && r1 != '\n' {
			// Not a line ending after all: put the look-ahead rune back and
			// deliver the carriage return itself. UnreadRune only fails when
			// the previous operation was not a ReadRune, which is not the
			// case here, so its result is deliberately discarded.
			_ = r.r.UnreadRune()
			r1 = '\r'
		}
		// If the look-ahead read failed, its (zero rune, error) pair is
		// returned as is; a lone \r directly before EOF is thereby dropped.
	}
	r.column++
	return r1, err
}

// skip reads runes up to and including the rune delim or until error.
func (r *Reader) skip(delim rune) error {
	for {
		r1, err := r.readRune()
		if err != nil {
			return err
		}
		if r1 == delim {
			return nil
		}
	}
}

// parseRecord reads and parses a single csv record from r.
//
// It returns a nil slice with a nil error for a blank line or a comment line,
// the fields together with io.EOF when the final record is not terminated by
// a newline, and a nil slice with the error for any other failure.
func (r *Reader) parseRecord() ([]string, error) {
	// Each record starts on a new line. We increment our line
	// number (lines start at 1, not 0) and set column to -1
	// so as we increment in readRune it points to the character we read.
	r.line++
	r.column = -1

	// Peek at the first rune. If it is an error we are done.
	// If we support comments and it is the comment character
	// then skip to the end of line.
	first, _, err := r.r.ReadRune()
	if err != nil {
		return nil, err
	}
	if r.Comment != 0 && first == r.Comment {
		return nil, r.skip('\n')
	}
	// Push the peeked rune back for parseField. The preceding operation was a
	// successful ReadRune, so UnreadRune has nothing to report.
	_ = r.r.UnreadRune()

	// At this point we have at least one field.
	var fields []string
	for {
		haveField, delim, ferr := r.parseField()
		if haveField {
			// If FieldsPerRecord is greater than 0 we can assume the final
			// length of fields to be equal to FieldsPerRecord.
			if r.FieldsPerRecord > 0 && fields == nil {
				fields = make([]string, 0, r.FieldsPerRecord)
			}
			fields = append(fields, r.field.String())
		}
		if delim == '\n' || ferr == io.EOF {
			return fields, ferr
		}
		if ferr != nil {
			return nil, ferr
		}
	}
}

// parseField parses the next field in the record. The read field is
// located in r.field. Delim is the first character not part of the field
// (r.Comma or '\n'); it is 0 when the field was ended by an error or by
// end of input.
func (r *Reader) parseField() (haveField bool, delim rune, err error) {
	r.field.Reset()

	r1, err := r.readRune()
	for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
		r1, err = r.readRune()
	}

	// Input ending right after a delimiter (column != 0) still counts as a
	// trailing empty field.
	if err == io.EOF && r.column != 0 {
		return true, 0, err
	}
	if err != nil {
		return false, 0, err
	}

	// The order of the cases matters: r.Comma is not a constant, so it is
	// tested first and wins if it happens to equal '\n' or '"'.
	switch r1 {
	case r.Comma:
		// Empty field; handled by the common tail below.

	case '\n':
		// We are a trailing empty field or a blank line.
		if r.column == 0 {
			return false, r1, nil
		}
		return true, r1, nil

	case '"':
		// Quoted field.
	Quoted:
		for {
			r1, err = r.readRune()
			if err != nil {
				if err == io.EOF {
					// Input ended inside the quotes.
					if r.LazyQuotes {
						return true, 0, err
					}
					return false, 0, r.error(ErrQuote)
				}
				return false, 0, err
			}
			switch r1 {
			case '"':
				// Either the closing quote or the first half of a doubled
				// quote; the next rune decides.
				r1, err = r.readRune()
				if err != nil || r1 == r.Comma {
					break Quoted
				}
				if r1 == '\n' {
					return true, r1, nil
				}
				if r1 != '"' {
					if !r.LazyQuotes {
						r.column-- // report at the offending quote
						return false, 0, r.error(ErrQuote)
					}
					// accept the bare quote
					r.field.WriteRune('"')
				}
			case '\n':
				// A newline inside quotes is data, but it still starts a new
				// physical line for error reporting.
				r.line++
				r.column = -1
			}
			r.field.WriteRune(r1)
		}

	default:
		// Unquoted field.
		for {
			r.field.WriteRune(r1)
			r1, err = r.readRune()
			if err != nil || r1 == r.Comma {
				break
			}
			if r1 == '\n' {
				return true, r1, nil
			}
			if !r.LazyQuotes && r1 == '"' {
				return false, 0, r.error(ErrBareQuote)
			}
		}
	}

	// Common tail: the field ended at r.Comma or because a read failed.
	if err != nil {
		if err == io.EOF {
			return true, 0, err
		}
		return false, 0, err
	}

	return true, r1, nil
}