// Source: github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/pkg/encoding/csv/reader.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package csv reads and writes comma-separated values (CSV) files.
//
// A csv file contains zero or more records of one or more fields per record.
// Each record is separated by the newline character. The final record may
// optionally be followed by a newline character.
//
//	field1,field2,field3
//
// White space is considered part of a field.
//
// Carriage returns before newline characters are silently removed.
//
// Blank lines are ignored.  A line with only whitespace characters (excluding
// the ending newline character) is not considered a blank line.
//
// Fields which start and stop with the quote character " are called
// quoted-fields.  The beginning and ending quote are not part of the
// field.
//
// The source:
//
//	normal string,"quoted-field"
//
// results in the fields
//
//	{`normal string`, `quoted-field`}
//
// Within a quoted-field a quote character followed by a second quote
// character is considered a single quote.
//
//	"the ""word"" is true","a ""quoted-field"""
//
// results in
//
//	{`the "word" is true`, `a "quoted-field"`}
//
// Newlines and commas may be included in a quoted-field
//
//	"Multi-line
//	field","comma is ,"
//
// results in
//
//	{`Multi-line
//	field`, `comma is ,`}
package csv

import (
	"bufio"
	"bytes"
	"errors"
	"fmt"
	"io"
	"unicode"
)

// A ParseError is returned for parsing errors.
// The first line is 1.  The first column is 0.
type ParseError struct {
	Line   int   // Line where the error occurred
	Column int   // Column (rune index) where the error occurred
	Err    error // The actual error
}

// Error formats the error as "line L, column C: message".
func (e *ParseError) Error() string {
	return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
}

// These are the errors that can be returned in ParseError.Err
var (
	ErrTrailingComma = errors.New("extra delimiter at end of line")
	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
	ErrQuote         = errors.New("extraneous \" in field")
	ErrFieldCount    = errors.New("wrong number of fields in line")
)

// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
//
// Comma is the field delimiter.  It defaults to ','.
//
// Comment, if not 0, is the comment character. Lines beginning with the
// Comment character are ignored.
//
// If FieldsPerRecord is positive, Read requires each record to
// have the given number of fields.  If FieldsPerRecord is 0, Read sets it to
// the number of fields in the first record, so that future records must
// have the same field count.  If FieldsPerRecord is negative, no check is
// made and records may have a variable number of fields.
//
// If LazyQuotes is true, a quote may appear in an unquoted field and a
// non-doubled quote may appear in a quoted field.
//
// If TrailingComma is true, the last field may be an unquoted empty field.
//
// If TrimLeadingSpace is true, leading white space in a field is ignored.
type Reader struct {
	Comma            rune // Field delimiter (set to ',' by NewReader)
	Comment          rune // Comment character for start of line
	FieldsPerRecord  int  // Number of expected fields per record
	LazyQuotes       bool // Allow lazy quotes
	TrailingComma    bool // Allow trailing comma
	TrimLeadingSpace bool // Trim leading space
	line             int
	column           int
	r                *bufio.Reader
	field            bytes.Buffer
}

// NewReader returns a new Reader that reads from r.
func NewReader(r io.Reader) *Reader {
	return &Reader{
		Comma: ',',
		r:     bufio.NewReader(r),
	}
}

// error creates a new ParseError based on err, stamped with the reader's
// current line and column.
func (r *Reader) error(err error) error {
	return &ParseError{
		Line:   r.line,
		Column: r.column,
		Err:    err,
	}
}

// Read reads one record from r.  The record is a slice of strings with each
// string representing one field.
//
// Blank lines and comment lines are skipped. When the field count does not
// match FieldsPerRecord, the offending record is returned together with a
// ParseError wrapping ErrFieldCount.
func (r *Reader) Read() (record []string, err error) {
	for {
		record, err = r.parseRecord()
		if record != nil {
			// parseRecord only pairs a non-nil record with a nil error or
			// io.EOF; the EOF is reported again by the next call.
			break
		}
		if err != nil {
			return nil, err
		}
	}

	if r.FieldsPerRecord > 0 {
		if len(record) != r.FieldsPerRecord {
			r.column = 0 // report at start of record
			return record, r.error(ErrFieldCount)
		}
	} else if r.FieldsPerRecord == 0 {
		r.FieldsPerRecord = len(record)
	}
	return record, nil
}

// ReadAll reads all the remaining records from r.
// Each record is a slice of fields.
// A successful call returns err == nil, not err == EOF. Because ReadAll is
// defined to read until EOF, it does not treat end of file as an error to be
// reported.
func (r *Reader) ReadAll() (records [][]string, err error) {
	for {
		record, err := r.Read()
		if err == io.EOF {
			return records, nil
		}
		if err != nil {
			return nil, err
		}
		records = append(records, record)
	}
}

// readRune reads one rune from r, folding \r\n to \n and keeping track
// of how far into the line we have read.  r.column will point to the start
// of this rune, not the end of this rune.
//
// We make the simplifying assumption that anytime \r is followed by \n
// it can be folded to \n.  We will not detect files which contain both
// \r\n and bare \n.
func (r *Reader) readRune() (rune, error) {
	// Look ahead with Peek *before* reading so that the most recent bufio
	// operation is always the ReadRune that produced the returned rune.
	// Reading past a '\r' and unreading the follower would leave bufio
	// unable to honour a later unreadRune, silently dropping the '\r'.
	b, err := r.r.Peek(1)
	if len(b) == 0 && err != nil {
		r.column++
		return 0, err
	}
	if len(b) == 1 && b[0] == '\r' {
		next, err := r.r.Peek(2)
		switch {
		case len(next) == 2 && next[1] == '\n':
			// Drop the '\r'; the ReadRune below returns the '\n'.
			// Cannot fail: Peek just showed the byte is buffered.
			_, _ = r.r.ReadByte()
		case len(next) < 2:
			// The '\r' is the last readable byte. Consume it and report
			// the read error (normally io.EOF) in its place.
			_, _ = r.r.ReadByte()
			r.column++
			return 0, err
		}
	}
	r1, _, err := r.r.ReadRune()
	r.column++
	return r1, err
}

// unreadRune puts the last rune read from r back.
// It must directly follow a successful readRune.
func (r *Reader) unreadRune() {
	r.r.UnreadRune()
	r.column--
}

// skip reads runes up to and including the rune delim or until error.
func (r *Reader) skip(delim rune) error {
	for {
		r1, err := r.readRune()
		if err != nil {
			return err
		}
		if r1 == delim {
			return nil
		}
	}
}

// parseRecord reads and parses a single csv record from r.
// It returns a nil record with a nil error for blank lines and comment lines.
func (r *Reader) parseRecord() (fields []string, err error) {
	// Each record starts on a new line.  We increment our line
	// number (lines start at 1, not 0) and set column to -1
	// so as we increment in readRune it points to the character we read.
	r.line++
	r.column = -1

	// Peek at the first rune.  If it is an error we are done.
	// If we support comments and it is the comment character
	// then skip to the end of line.
	r1, _, err := r.r.ReadRune()
	if err != nil {
		return nil, err
	}

	if r.Comment != 0 && r1 == r.Comment {
		return nil, r.skip('\n')
	}
	r.r.UnreadRune()

	// At this point we have at least one field.
	for {
		haveField, delim, err := r.parseField()
		if haveField {
			fields = append(fields, r.field.String())
		}
		if delim == '\n' || err == io.EOF {
			return fields, err
		} else if err != nil {
			return nil, err
		}
	}
}

// parseField parses the next field in the record.  The read field is
// located in r.field.  Delim is the first character not part of the field
// (r.Comma or '\n'); it is 0 when the field was ended by an error.
func (r *Reader) parseField() (haveField bool, delim rune, err error) {
	r.field.Reset()

	r1, err := r.readRune()
	if err != nil {
		// If we have EOF and are not at the start of a line
		// then we return the empty field.  We have already
		// checked for trailing commas if needed.
		if err == io.EOF && r.column != 0 {
			return true, 0, err
		}
		return false, 0, err
	}

	if r.TrimLeadingSpace {
		for r1 != '\n' && unicode.IsSpace(r1) {
			r1, err = r.readRune()
			if err != nil {
				return false, 0, err
			}
		}
	}

	switch r1 {
	case r.Comma:
		// Empty field; the trailing-comma check happens below.

	case '\n':
		// We are a trailing empty field or a blank line
		if r.column == 0 {
			return false, r1, nil
		}
		return true, r1, nil

	case '"':
		// quoted field
	Quoted:
		for {
			r1, err = r.readRune()
			if err != nil {
				if err == io.EOF {
					if r.LazyQuotes {
						return true, 0, err
					}
					return false, 0, r.error(ErrQuote)
				}
				return false, 0, err
			}
			switch r1 {
			case '"':
				r1, err = r.readRune()
				if err != nil || r1 == r.Comma {
					break Quoted
				}
				if r1 == '\n' {
					return true, r1, nil
				}
				if r1 != '"' {
					if !r.LazyQuotes {
						r.column-- // report the quote, not the rune after it
						return false, 0, r.error(ErrQuote)
					}
					// accept the bare quote
					r.field.WriteRune('"')
				}
			case '\n':
				// A newline inside quotes starts a new physical line.
				r.line++
				r.column = -1
			}
			r.field.WriteRune(r1)
		}

	default:
		// unquoted field
		for {
			r.field.WriteRune(r1)
			r1, err = r.readRune()
			if err != nil || r1 == r.Comma {
				break
			}
			if r1 == '\n' {
				return true, r1, nil
			}
			if !r.LazyQuotes && r1 == '"' {
				return false, 0, r.error(ErrBareQuote)
			}
		}
	}

	if err != nil {
		if err == io.EOF {
			return true, 0, err
		}
		return false, 0, err
	}

	// The field was terminated by r.Comma.
	if !r.TrailingComma {
		// We don't allow trailing commas.  See if we
		// are at the end of the line (being mindful
		// of trimming spaces).
		commaCol := r.column
		next, err := r.readRune()
		if r.TrimLeadingSpace {
			for err == nil && next != '\n' && unicode.IsSpace(next) {
				next, err = r.readRune()
			}
		}
		if err == io.EOF || (err == nil && next == '\n') {
			r.column = commaCol // report the comma
			return false, 0, r.error(ErrTrailingComma)
		}
		if err != nil {
			// A real read error must not be mistaken for more input.
			return false, 0, err
		}
		// Give the first rune of the next field back.
		r.unreadRune()
	}
	return true, r.Comma, nil
}