github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/util/encoding/csv/reader.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  // Copyright 2011 The Go Authors. All rights reserved.
    12  // Use of this source code is governed by a BSD-style
    13  // license that can be found in licenses/BSD-golang.txt.
    14  
    15  // Package csv reads and writes comma-separated values (CSV) files.
    16  // There are many kinds of CSV files; this package supports the format
    17  // described in RFC 4180.
    18  //
    19  // A csv file contains zero or more records of one or more fields per record.
    20  // Each record is separated by the newline character. The final record may
    21  // optionally be followed by a newline character.
    22  //
    23  //	field1,field2,field3
    24  //
    25  // White space is considered part of a field.
    26  //
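// A carriage return before the newline character that ends a record is
// silently removed. Inside a quoted-field that spans several lines, a
// carriage return before a newline is kept as part of the field.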
    28  //
    29  // Blank lines are ignored. A line with only whitespace characters (excluding
    30  // the ending newline character) is not considered a blank line.
    31  //
    32  // Fields which start and stop with the quote character " are called
    33  // quoted-fields. The beginning and ending quote are not part of the
    34  // field.
    35  //
    36  // The source:
    37  //
    38  //	normal string,"quoted-field"
    39  //
    40  // results in the fields
    41  //
    42  //	{`normal string`, `quoted-field`}
    43  //
    44  // Within a quoted-field a quote character followed by a second quote
    45  // character is considered a single quote.
    46  //
    47  //	"the ""word"" is true","a ""quoted-field"""
    48  //
    49  // results in
    50  //
    51  //	{`the "word" is true`, `a "quoted-field"`}
    52  //
// Newlines and commas may be included in a quoted-field:
    54  //
    55  //	"Multi-line
    56  //	field","comma is ,"
    57  //
    58  // results in
    59  //
    60  //	{`Multi-line
    61  //	field`, `comma is ,`}
    62  package csv
    63  
    64  import (
    65  	"bufio"
    66  	"bytes"
    67  	"fmt"
    68  	"io"
    69  	"unicode"
    70  	"unicode/utf8"
    71  
    72  	"github.com/cockroachdb/errors"
    73  )
    74  
// A ParseError is returned for parsing errors.
// Line numbers are 1-indexed and columns are 0-indexed.
//
// StartLine and Line differ only when the record spans several input lines,
// which happens when a quoted-field contains newlines. For ErrFieldCount
// errors, Line is set to StartLine and Column is left at its zero value.
type ParseError struct {
	StartLine int   // Line where the record starts
	Line      int   // Line where the error occurred
	Column    int   // Column (rune index) where the error occurred
	Err       error // The actual error
}
    83  
    84  var _ error = (*ParseError)(nil)
    85  var _ fmt.Formatter = (*ParseError)(nil)
    86  var _ errors.Formatter = (*ParseError)(nil)
    87  
    88  // Error implements error.
    89  func (e *ParseError) Error() string { return fmt.Sprintf("%v", e) }
    90  
    91  // Cause implements causer.
    92  func (e *ParseError) Cause() error { return e.Err }
    93  
    94  // Format implements fmt.Formatter.
    95  func (e *ParseError) Format(s fmt.State, verb rune) { errors.FormatError(e, s, verb) }
    96  
    97  // FormatError implements errors.Formatter.
    98  func (e *ParseError) FormatError(p errors.Printer) error {
    99  	if errors.Is(e.Err, ErrFieldCount) {
   100  		p.Printf("record on line %d", e.Line)
   101  	} else if e.StartLine != e.Line {
   102  		p.Printf("record on line %d; parse error on line %d, column %d", e.StartLine, e.Line, e.Column)
   103  	} else {
   104  		p.Printf("parse error on line %d, column %d", e.Line, e.Column)
   105  	}
   106  	return e.Err
   107  }
   108  
// These are the errors that can be returned in ParseError.Err.
//
// ErrBareQuote is reported when a quote character appears inside a
// non-quoted field and LazyQuotes is false.
//
// ErrQuote is reported, when LazyQuotes is false, for a quote inside a
// quoted-field that is followed by neither another quote, the field
// delimiter, nor the end of the line, and for a quoted-field that is still
// open when the input ends.
//
// ErrFieldCount is reported when FieldsPerRecord is positive and a record
// has a different number of fields.
var (
	ErrBareQuote  = errors.New("bare \" in non-quoted-field")
	ErrQuote      = errors.New("extraneous or missing \" in quoted-field")
	ErrFieldCount = errors.New("wrong number of fields")
)
   115  
// errInvalidDelim is returned by readRecord, without a ParseError wrapper,
// when Comma or Comment fails validation or when the two are equal.
var errInvalidDelim = errors.New("csv: invalid field or comment delimiter")
   117  
   118  func validDelim(r rune) bool {
   119  	return r != 0 && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError
   120  }
   121  
// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to Read or ReadAll.
//
// Comma and Comment are validated at the start of every record read; an
// invalid combination makes the read fail with an error.
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	// Comma must be a valid rune, must differ from Comment, and must not be
	// NUL, \r, \n, or the Unicode replacement character (0xFFFD).
	Comma rune

	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	// When set, Comment is subject to the same restrictions as Comma.
	Comment rune

	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int

	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes bool

	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool

	// ReuseRecord controls whether calls to Read may return a slice sharing
	// the backing array of the previous call's returned slice for performance.
	// By default, each call to Read returns newly allocated memory owned by the caller.
	ReuseRecord bool

	// r is the buffered reader that wraps the input passed to NewReader.
	r *bufio.Reader

	// numLine is the current line being read in the CSV file.
	// It is incremented on every call to readLine.
	numLine int

	// rawBuffer is a line buffer only used by the readLine method.
	// It holds lines that are too long for r's internal buffer.
	rawBuffer []byte

	// recordBuffer holds the unescaped fields, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
	// and fieldIndexes will contain the indexes [1, 2, 5, 6].
	recordBuffer []byte

	// fieldIndexes is an index of fields inside recordBuffer.
	// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
	fieldIndexes []int

	// lastRecord is a record cache and only used when ReuseRecord == true.
	lastRecord []string
}
   180  
   181  // NewReader returns a new Reader that reads from r.
   182  func NewReader(r io.Reader) *Reader {
   183  	return &Reader{
   184  		Comma: ',',
   185  		r:     bufio.NewReader(r),
   186  	}
   187  }
   188  
   189  // Read reads one record (a slice of fields) from r.
   190  // If the record has an unexpected number of fields,
   191  // Read returns the record along with the error ErrFieldCount.
   192  // Except for that case, Read always returns either a non-nil
   193  // record or a non-nil error, but not both.
   194  // If there is no data left to be read, Read returns nil, io.EOF.
   195  // If ReuseRecord is true, the returned slice may be shared
   196  // between multiple calls to Read.
   197  func (r *Reader) Read() (record []string, err error) {
   198  	if r.ReuseRecord {
   199  		record, err = r.readRecord(r.lastRecord)
   200  		r.lastRecord = record
   201  	} else {
   202  		record, err = r.readRecord(nil)
   203  	}
   204  	return record, err
   205  }
   206  
   207  // ReadAll reads all the remaining records from r.
   208  // Each record is a slice of fields.
   209  // A successful call returns err == nil, not err == io.EOF. Because ReadAll is
   210  // defined to read until EOF, it does not treat end of file as an error to be
   211  // reported.
   212  func (r *Reader) ReadAll() (records [][]string, err error) {
   213  	for {
   214  		record, err := r.readRecord(nil)
   215  		if err == io.EOF {
   216  			return records, nil
   217  		}
   218  		if err != nil {
   219  			return nil, err
   220  		}
   221  		records = append(records, record)
   222  	}
   223  }
   224  
   225  // readLine reads the next line (with the trailing endline).
   226  // If EOF is hit without a trailing endline, it will be omitted.
   227  // If some bytes were read, then the error is never io.EOF.
   228  // The result is only valid until the next call to readLine.
   229  func (r *Reader) readLine() ([]byte, error) {
   230  	line, err := r.r.ReadSlice('\n')
   231  	if errors.Is(err, bufio.ErrBufferFull) {
   232  		r.rawBuffer = append(r.rawBuffer[:0], line...)
   233  		for errors.Is(err, bufio.ErrBufferFull) {
   234  			line, err = r.r.ReadSlice('\n')
   235  			r.rawBuffer = append(r.rawBuffer, line...)
   236  		}
   237  		line = r.rawBuffer
   238  	}
   239  	if len(line) > 0 && err == io.EOF {
   240  		err = nil
   241  		// For backwards compatibility, drop trailing \r before EOF.
   242  		if line[len(line)-1] == '\r' {
   243  			line = line[:len(line)-1]
   244  		}
   245  	}
   246  	r.numLine++
   247  	return line, err
   248  }
   249  
   250  // lengthCRLF reports the number of bytes for a trailing "\r\n".
   251  func lengthCRLF(b []byte) int {
   252  	if j := len(b) - 1; j >= 0 && b[j] == '\n' {
   253  		if j := len(b) - 2; j >= 0 && b[j] == '\r' {
   254  			return 2
   255  		}
   256  		return 1
   257  	}
   258  	return 0
   259  }
   260  
   261  // nextRune returns the next rune in b or utf8.RuneError.
   262  func nextRune(b []byte) rune {
   263  	r, _ := utf8.DecodeRune(b)
   264  	return r
   265  }
   266  
// readRecord reads and parses the next record from the input.
//
// dst is an optional slice to store the fields in: its backing array is
// reused when its capacity is large enough, otherwise a new slice is
// allocated.
//
// It returns errInvalidDelim when Comma or Comment is invalid or when both
// are the same rune, and nil, io.EOF when there is no more input. Parse
// errors are returned as *ParseError together with the fields that were
// completed before the error. A field-count mismatch is reported as a
// *ParseError wrapping ErrFieldCount together with the full record.
func (r *Reader) readRecord(dst []string) ([]string, error) {
	if r.Comma == r.Comment || !validDelim(r.Comma) || (r.Comment != 0 && !validDelim(r.Comment)) {
		return nil, errInvalidDelim
	}

	// Read line (automatically skipping past empty lines and any comments).
	// line is the part of the current input line that is still unparsed;
	// fullLine is the whole current input line and is only used to compute
	// the column of an error.
	var line, fullLine []byte
	var errRead error
	for errRead == nil {
		line, errRead = r.readLine()
		if r.Comment != 0 && nextRune(line) == r.Comment {
			line = nil
			continue // Skip comment lines
		}
		if errRead == nil && len(line) == lengthCRLF(line) {
			line = nil
			continue // Skip empty lines
		}
		fullLine = line
		break
	}
	if errRead == io.EOF {
		return nil, errRead
	}

	// Parse each field in the record.
	var err error
	const quoteLen = len(`"`)
	commaLen := utf8.RuneLen(r.Comma)
	recLine := r.numLine // Starting line for record
	// Reuse the buffers of the previous record.
	r.recordBuffer = r.recordBuffer[:0]
	r.fieldIndexes = r.fieldIndexes[:0]
parseField:
	for {
		if r.TrimLeadingSpace {
			line = bytes.TrimLeftFunc(line, unicode.IsSpace)
		}
		if len(line) == 0 || line[0] != '"' {
			// Non-quoted string field
			// The field runs up to the next delimiter or, for the last
			// field, up to the line terminator.
			i := bytes.IndexRune(line, r.Comma)
			field := line
			if i >= 0 {
				field = field[:i]
			} else {
				field = field[:len(field)-lengthCRLF(field)]
			}
			// Check to make sure a quote does not appear in field.
			if !r.LazyQuotes {
				if j := bytes.IndexByte(field, '"'); j >= 0 {
					// line[j:] starts at the offending quote, so the prefix
					// of fullLine before it gives the column in runes.
					col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
					err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
					break parseField
				}
			}
			r.recordBuffer = append(r.recordBuffer, field...)
			r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
			if i >= 0 {
				line = line[i+commaLen:]
				continue parseField
			}
			break parseField
		} else {
			// Quoted string field
			line = line[quoteLen:]
			// Each iteration consumes input up to the next quote, or the
			// rest of the line when the line contains no further quote.
			for {
				i := bytes.IndexByte(line, '"')
				if i >= 0 {
					// Hit next quote.
					r.recordBuffer = append(r.recordBuffer, line[:i]...)
					line = line[i+quoteLen:]
					// What follows the quote decides how it is interpreted.
					switch rn := nextRune(line); {
					case rn == '"':
						// `""` sequence (append quote).
						r.recordBuffer = append(r.recordBuffer, '"')
						line = line[quoteLen:]
					case rn == r.Comma:
						// `",` sequence (end of field).
						line = line[commaLen:]
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						continue parseField
					case lengthCRLF(line) == len(line):
						// `"\n` sequence (end of line).
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						break parseField
					case r.LazyQuotes:
						// `"` sequence (bare quote).
						r.recordBuffer = append(r.recordBuffer, '"')
					default:
						// `"*` sequence (invalid non-escaped quote).
						// line now starts just after the quote, hence the
						// extra quoteLen to point the column at the quote.
						col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
						break parseField
					}
				} else if len(line) > 0 {
					// Hit end of line (copy all data so far).
					// The line terminator is copied too, so it stays part of
					// the field.
					r.recordBuffer = append(r.recordBuffer, line...)
					if errRead != nil {
						break parseField
					}
					// The field continues on the next input line.
					line, errRead = r.readLine()
					if errRead == io.EOF {
						// Not an error by itself: the empty line makes the
						// next iteration take the abrupt-end branch below.
						errRead = nil
					}
					fullLine = line
				} else {
					// Abrupt end of file (EOF or error).
					// Without LazyQuotes an unterminated quoted-field is an
					// error; otherwise what was read so far is the field.
					if !r.LazyQuotes && errRead == nil {
						col := utf8.RuneCount(fullLine)
						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
						break parseField
					}
					r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
					break parseField
				}
			}
		}
	}
	// A parse error takes precedence over a read error.
	if err == nil {
		err = errRead
	}

	// Create a single string and create slices out of it.
	// This pins the memory of the fields together, but allocates once.
	str := string(r.recordBuffer) // Convert to string once to batch allocations
	dst = dst[:0]
	if cap(dst) < len(r.fieldIndexes) {
		dst = make([]string, len(r.fieldIndexes))
	}
	dst = dst[:len(r.fieldIndexes)]
	// fieldIndexes holds the end offset of each field; the start of a field
	// is the end of the previous one.
	var preIdx int
	for i, idx := range r.fieldIndexes {
		dst[i] = str[preIdx:idx]
		preIdx = idx
	}

	// Check or update the expected fields per record.
	// A field-count error is only reported when no other error occurred.
	if r.FieldsPerRecord > 0 {
		if len(dst) != r.FieldsPerRecord && err == nil {
			err = &ParseError{StartLine: recLine, Line: recLine, Err: ErrFieldCount}
		}
	} else if r.FieldsPerRecord == 0 {
		// NOTE(review): this assignment also runs when err != nil, so the
		// partial field count of a first record that failed to parse can
		// become the expected count for all later records.
		r.FieldsPerRecord = len(dst)
	}
	return dst, err
}