github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/internal/s3select/csv/reader.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package csv

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"runtime"
	"sync"
	"unicode/utf8"

	csv "github.com/minio/csvparser"
	"github.com/minio/minio/internal/s3select/sql"
)

// Reader - CSV record reader for S3Select.
type Reader struct {
	args         *ReaderArgs
	readCloser   io.ReadCloser    // raw input
	buf          *bufio.Reader    // input to the splitter
	columnNames  []string         // names of columns
	nameIndexMap map[string]int64 // name to column index
	current      [][]string       // current block of results to be returned
	recordsRead  int              // number of records read in current slice
	input        chan *queueItem  // input for workers
	queue        chan *queueItem  // output from workers in order
	err          error            // global error state, only touched by Reader.Read
	bufferPool   sync.Pool        // pool of []byte objects for input
	csvDstPool   sync.Pool        // pool of [][]string used for output
	close        chan struct{}    // used for shutting down the splitter before end of stream
	readerWg     sync.WaitGroup   // tracks the async splitter goroutine so Close can wait for it
}

// queueItem is an item in the queue.
type queueItem struct {
	input []byte          // raw input sent to the worker
	dst   chan [][]string // result of block decode
	err   error           // any error encountered will be set here
}

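// Dataflow overview (a summary of the code below, not a contract): the
// splitter goroutine cuts the stream into roughly csvSplitSize blocks ending
// on a newline and sends each block, wrapped in a queueItem, to both r.input
// (consumed by the parser workers) and r.queue (consumed in order by Read).
// Each worker decodes its block and delivers the records on the item's dst
// channel, so Read observes blocks in input order even though they are
// parsed concurrently.
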
// Read - reads a single record.
// Once Read is called, the previous record should no longer be referenced.
func (r *Reader) Read(dst sql.Record) (sql.Record, error) {
	// If we have any records left, return these before any error.
	for len(r.current) <= r.recordsRead {
		if r.err != nil {
			return nil, r.err
		}
		// Move to next block
		item, ok := <-r.queue
		if !ok {
			r.err = io.EOF
			return nil, r.err
		}
		//nolint:staticcheck // SA6002 Using pointer would allocate more since we would have to copy slice header before taking a pointer.
		r.csvDstPool.Put(r.current)
		r.current = <-item.dst
		r.err = item.err
		r.recordsRead = 0
	}
	csvRecord := r.current[r.recordsRead]
	r.recordsRead++

	// If no column names are set, use _(index)
	if r.columnNames == nil {
		r.columnNames = make([]string, len(csvRecord))
		for i := range csvRecord {
			r.columnNames[i] = fmt.Sprintf("_%v", i+1)
		}
	}

	// If no index map is set, build it.
	if r.nameIndexMap == nil {
		r.nameIndexMap = make(map[string]int64)
		for i := range r.columnNames {
			r.nameIndexMap[r.columnNames[i]] = int64(i)
		}
	}
	dstRec, ok := dst.(*Record)
	if !ok {
		dstRec = &Record{}
	}
	dstRec.columnNames = r.columnNames
	dstRec.csvRecord = csvRecord
	dstRec.nameIndexMap = r.nameIndexMap

	return dstRec, nil
}

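// readAllRecords is a minimal usage sketch (a hypothetical helper, not part
// of the production flow): it drains a Reader record by record, reusing the
// same destination record as Read's contract allows. When the input has no
// header, columns are named _1, _2, ... as assigned above.
func readAllRecords(r *Reader) (n int, err error) {
	var rec sql.Record = &Record{}
	for {
		rec, err = r.Read(rec)
		if err == io.EOF {
			return n, nil
		}
		if err != nil {
			return n, err
		}
		n++
		_ = rec // only valid until the next Read call
	}
}
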
// Close - closes the underlying reader, shutting down the splitter first if
// it is still running. Subsequent Read calls return any stored error or io.EOF.
func (r *Reader) Close() error {
	if r.close != nil {
		close(r.close)
		r.readerWg.Wait()
		r.close = nil
	}
	r.recordsRead = len(r.current)
	if r.err == nil {
		r.err = io.EOF
	}
	return r.readCloser.Close()
}

// nextSplit will attempt to read 'skip' bytes and then continue reading until
// the next newline, returning everything read, newline included.
// The last block will be sent along with an io.EOF.
func (r *Reader) nextSplit(skip int, dst []byte) ([]byte, error) {
	if cap(dst) < skip {
		dst = make([]byte, 0, skip+1024)
	}
	dst = dst[:skip]
	if skip > 0 {
		n, err := io.ReadFull(r.buf, dst)
		if err != nil && err != io.ErrUnexpectedEOF {
			// If an EOF happens after reading some but not all the bytes,
			// ReadFull returns ErrUnexpectedEOF.
			return dst[:n], err
		}
		dst = dst[:n]
		if err == io.ErrUnexpectedEOF {
			return dst, io.EOF
		}
	}
	// Read until next line.
	in, err := r.buf.ReadBytes('\n')
	dst = append(dst, in...)
	return dst, err
}

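// nextSplitExample is an illustrative sketch (a hypothetical helper, not used
// anywhere) of nextSplit's contract. With buffered input "abcdef\nghi" and
// skip=4, the call reads the 4 requested bytes plus the spill up to and
// including the next newline, returning "abcdef\n".
func nextSplitExample() ([]byte, error) {
	r := &Reader{buf: bufio.NewReader(bytes.NewReader([]byte("abcdef\nghi")))}
	return r.nextSplit(4, nil) // -> []byte("abcdef\n"), nil
}
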
// csvSplitSize is the size of each block.
// Blocks read this much and then extend to the first following newline.
// 128 KiB appears to be a very reasonable default.
const csvSplitSize = 128 << 10

// startReaders will read the header if needed and spin up a parser
// and a number of workers based on GOMAXPROCS.
// If an error is returned, no goroutines have been started and r.err will have been set.
func (r *Reader) startReaders(newReader func(io.Reader) *csv.Reader) error {
	if r.args.FileHeaderInfo != none {
		// Read column names
		// Get one line.
		b, err := r.nextSplit(0, nil)
		if err != nil {
			r.err = err
			return err
		}
		if !utf8.Valid(b) {
			return errInvalidTextEncodingError()
		}
		reader := newReader(bytes.NewReader(b))
		record, err := reader.Read()
		if err != nil {
			r.err = err
			if err != io.EOF {
				r.err = errCSVParsingError(err)
				return errCSVParsingError(err)
			}
			return err
		}

		if r.args.FileHeaderInfo == use {
			// Copy column names since records will be reused.
			columns := append(make([]string, 0, len(record)), record...)
			r.columnNames = columns
		}
	}

	r.bufferPool.New = func() interface{} {
		return make([]byte, csvSplitSize+1024)
	}

	// Read the first block.
	next, nextErr := r.nextSplit(csvSplitSize, r.bufferPool.Get().([]byte))
	// Check if first block is valid.
	if !utf8.Valid(next) {
		return errInvalidTextEncodingError()
	}

	// Create the queue and input channels.
	r.queue = make(chan *queueItem, runtime.GOMAXPROCS(0))
	r.input = make(chan *queueItem, runtime.GOMAXPROCS(0))
	r.readerWg.Add(1)

	// Start splitter
	go func() {
		defer close(r.input)
		defer close(r.queue)
		defer r.readerWg.Done()
		for {
			q := queueItem{
				input: next,
				dst:   make(chan [][]string, 1),
				err:   nextErr,
			}
			select {
			case <-r.close:
				return
			case r.queue <- &q:
			}

			select {
			case <-r.close:
				return
			case r.input <- &q:
			}
			if nextErr != nil {
				// Exit on any error.
				return
			}
			next, nextErr = r.nextSplit(csvSplitSize, r.bufferPool.Get().([]byte))
		}
	}()

	// Start parsers
	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
		go func() {
			for in := range r.input {
				if len(in.input) == 0 {
					in.dst <- nil
					continue
				}
				dst, ok := r.csvDstPool.Get().([][]string)
				if !ok {
					dst = make([][]string, 0, 1000)
				}

				cr := newReader(bytes.NewBuffer(in.input))
				all := dst[:0]
				err := func() error {
					// Read all records until EOF or another error.
					for {
						record, err := cr.Read()
						if err == io.EOF {
							return nil
						}
						if err != nil {
							return errCSVParsingError(err)
						}
						var recDst []string
						if len(dst) > len(all) {
							recDst = dst[len(all)]
						}
						if cap(recDst) < len(record) {
							recDst = make([]string, len(record))
						}
						recDst = recDst[:len(record)]
						copy(recDst, record)
						all = append(all, recDst)
					}
				}()
				if err != nil {
					in.err = err
				}
				// We don't need the input any more.
				//nolint:staticcheck // SA6002 Using pointer would allocate more since we would have to copy slice header before taking a pointer.
				r.bufferPool.Put(in.input)
				in.input = nil
				in.dst <- all
			}
		}()
	}
	return nil
}

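// orderedFanIn is a minimal sketch (a hypothetical helper, not used by the
// reader) of the ordering pattern above: every block is enqueued on an
// ordered channel together with its own result channel before being handed
// to the workers, so results can be consumed in submission order regardless
// of which worker finishes first.
func orderedFanIn(blocks [][]byte, work func([]byte) [][]string) [][][]string {
	type item struct {
		in  []byte
		dst chan [][]string
	}
	queue := make(chan *item, len(blocks)) // preserves submission order
	input := make(chan *item, len(blocks)) // feeds the workers
	for _, b := range blocks {
		it := &item{in: b, dst: make(chan [][]string, 1)}
		queue <- it
		input <- it
	}
	close(queue)
	close(input)
	// Workers may finish in any order; each delivers on its item's dst channel.
	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
		go func() {
			for it := range input {
				it.dst <- work(it.in)
			}
		}()
	}
	// Draining queue in order yields results in input order.
	out := make([][][]string, 0, len(blocks))
	for it := range queue {
		out = append(out, <-it.dst)
	}
	return out
}
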
// NewReader - creates new CSV reader using readCloser.
func NewReader(readCloser io.ReadCloser, args *ReaderArgs) (*Reader, error) {
	if args == nil || args.IsEmpty() {
		panic(fmt.Errorf("empty args passed %v", args))
	}
	csvIn := io.Reader(readCloser)
	if args.RecordDelimiter != "\n" {
		csvIn = &recordTransform{
			reader:          readCloser,
			recordDelimiter: []byte(args.RecordDelimiter),
			oneByte:         make([]byte, len(args.RecordDelimiter)-1),
		}
	}

	r := &Reader{
		args:       args,
		buf:        bufio.NewReaderSize(csvIn, csvSplitSize*2),
		readCloser: readCloser,
		close:      make(chan struct{}),
	}

	// Assume args are validated by ReaderArgs.UnmarshalXML()
	newCsvReader := func(r io.Reader) *csv.Reader {
		ret := csv.NewReader(r)
		ret.Comma = []rune(args.FieldDelimiter)[0]
		ret.Comment = []rune(args.CommentCharacter)[0]
		ret.Quote = []rune{}
		if len([]rune(args.QuoteCharacter)) > 0 {
			// Add the first rune of args.QuoteCharacter
			ret.Quote = append(ret.Quote, []rune(args.QuoteCharacter)[0])
		}
		ret.QuoteEscape = []rune(args.QuoteEscapeCharacter)[0]
		ret.FieldsPerRecord = -1
		// If LazyQuotes is true, a quote may appear in an unquoted field and a
		// non-doubled quote may appear in a quoted field.
		ret.LazyQuotes = true
		// We do not trim leading space to keep consistent with S3.
		ret.TrimLeadingSpace = false
		ret.ReuseRecord = true
		return ret
	}

	return r, r.startReaders(newCsvReader)
}
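
// exampleNewReader is a minimal usage sketch (hypothetical, not part of the
// production flow): given an input stream and args already validated by
// ReaderArgs.UnmarshalXML, it builds a Reader, drains it, and closes it.
// Inputs using a custom RecordDelimiter such as "\r\n" are rewritten to "\n"
// by the recordTransform wrapper installed in NewReader above.
func exampleNewReader(rc io.ReadCloser, args *ReaderArgs) error {
	r, err := NewReader(rc, args)
	if err != nil {
		return err
	}
	defer r.Close()
	_, err = readAllRecords(r) // see the sketch after Read above
	return err
}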