github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/internal/s3select/csv/reader.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package csv

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"runtime"
	"sync"
	"unicode/utf8"

	csv "github.com/minio/csvparser"
	"github.com/minio/minio/internal/s3select/sql"
)

// Reader - CSV record reader for S3Select.
type Reader struct {
	args         *ReaderArgs
	readCloser   io.ReadCloser    // raw input
	buf          *bufio.Reader    // input to the splitter
	columnNames  []string         // names of columns
	nameIndexMap map[string]int64 // name to column index
	current      [][]string       // current block of results to be returned
	recordsRead  int              // number of records read in current slice
	input        chan *queueItem  // input for workers
	queue        chan *queueItem  // output from workers in order
	err          error            // global error state, only touched by Reader.Read
	bufferPool   sync.Pool        // pool of []byte objects for input
	csvDstPool   sync.Pool        // pool of [][]string used for output
	close        chan struct{}    // used for shutting down the splitter before end of stream
	readerWg     sync.WaitGroup   // used to keep track of async reader.
}

// queueItem is an item in the queue.
type queueItem struct {
	input []byte          // raw input sent to the worker
	dst   chan [][]string // result of block decode
	err   error           // any error encountered will be set here
}

// Read - reads single record.
// Once Read is called the previous record should no longer be referenced.
func (r *Reader) Read(dst sql.Record) (sql.Record, error) {
	// If we have any records left, return these before any error.
	for len(r.current) <= r.recordsRead {
		if r.err != nil {
			return nil, r.err
		}
		// Move to next block
		item, ok := <-r.queue
		if !ok {
			r.err = io.EOF
			return nil, r.err
		}
		//nolint:staticcheck // SA6002 Using pointer would allocate more since we would have to copy slice header before taking a pointer.
		r.csvDstPool.Put(r.current)
		r.current = <-item.dst
		r.err = item.err
		r.recordsRead = 0
	}
	csvRecord := r.current[r.recordsRead]
	r.recordsRead++

	// If no column names are set, use _(index)
	if r.columnNames == nil {
		r.columnNames = make([]string, len(csvRecord))
		for i := range csvRecord {
			r.columnNames[i] = fmt.Sprintf("_%v", i+1)
		}
	}

	// If no index map, add that.
	if r.nameIndexMap == nil {
		r.nameIndexMap = make(map[string]int64)
		for i := range r.columnNames {
			r.nameIndexMap[r.columnNames[i]] = int64(i)
		}
	}
	dstRec, ok := dst.(*Record)
	if !ok {
		dstRec = &Record{}
	}
	dstRec.columnNames = r.columnNames
	dstRec.csvRecord = csvRecord
	dstRec.nameIndexMap = r.nameIndexMap

	return dstRec, nil
}

// Close - closes underlying reader.
func (r *Reader) Close() error {
	if r.close != nil {
		close(r.close)
		r.readerWg.Wait()
		r.close = nil
	}
	r.recordsRead = len(r.current)
	if r.err == nil {
		r.err = io.EOF
	}
	return r.readCloser.Close()
}

// nextSplit will attempt to skip a number of bytes and
// return the buffer until the next newline occurs.
// The last block will be sent along with an io.EOF.
func (r *Reader) nextSplit(skip int, dst []byte) ([]byte, error) {
	if cap(dst) < skip {
		dst = make([]byte, 0, skip+1024)
	}
	dst = dst[:skip]
	if skip > 0 {
		n, err := io.ReadFull(r.buf, dst)
		if err != nil && err != io.ErrUnexpectedEOF {
			// If an EOF happens after reading some but not all the bytes,
			// ReadFull returns ErrUnexpectedEOF.
			return dst[:n], err
		}
		dst = dst[:n]
		if err == io.ErrUnexpectedEOF {
			return dst, io.EOF
		}
	}
	// Read until next line.
	in, err := r.buf.ReadBytes('\n')
	dst = append(dst, in...)
	return dst, err
}

// csvSplitSize is the size of each block.
// Blocks will read this much and find the first following newline.
// 128KB appears to be a very reasonable default.
const csvSplitSize = 128 << 10

// startReaders will read the header if needed and spin up a parser
// and a number of workers based on GOMAXPROCS.
// If an error is returned no goroutines have been started and r.err will have been set.
func (r *Reader) startReaders(newReader func(io.Reader) *csv.Reader) error {
	if r.args.FileHeaderInfo != none {
		// Read column names
		// Get one line.
		b, err := r.nextSplit(0, nil)
		if err != nil {
			r.err = err
			return err
		}
		if !utf8.Valid(b) {
			return errInvalidTextEncodingError()
		}
		reader := newReader(bytes.NewReader(b))
		record, err := reader.Read()
		if err != nil {
			r.err = err
			if err != io.EOF {
				r.err = errCSVParsingError(err)
				return errCSVParsingError(err)
			}
			return err
		}

		if r.args.FileHeaderInfo == use {
			// Copy column names since records will be reused.
			columns := append(make([]string, 0, len(record)), record...)
			r.columnNames = columns
		}
	}

	r.bufferPool.New = func() interface{} {
		return make([]byte, csvSplitSize+1024)
	}

	// Return first block
	next, nextErr := r.nextSplit(csvSplitSize, r.bufferPool.Get().([]byte))
	// Check if first block is valid.
	if !utf8.Valid(next) {
		return errInvalidTextEncodingError()
	}

	// Create queue
	r.queue = make(chan *queueItem, runtime.GOMAXPROCS(0))
	r.input = make(chan *queueItem, runtime.GOMAXPROCS(0))
	r.readerWg.Add(1)

	// Start splitter
	go func() {
		defer close(r.input)
		defer close(r.queue)
		defer r.readerWg.Done()
		for {
			q := queueItem{
				input: next,
				dst:   make(chan [][]string, 1),
				err:   nextErr,
			}
			select {
			case <-r.close:
				return
			case r.queue <- &q:
			}

			select {
			case <-r.close:
				return
			case r.input <- &q:
			}
			if nextErr != nil {
				// Exit on any error.
				return
			}
			next, nextErr = r.nextSplit(csvSplitSize, r.bufferPool.Get().([]byte))
		}
	}()

	// Start parsers
	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
		go func() {
			for in := range r.input {
				if len(in.input) == 0 {
					in.dst <- nil
					continue
				}
				dst, ok := r.csvDstPool.Get().([][]string)
				if !ok {
					dst = make([][]string, 0, 1000)
				}

				cr := newReader(bytes.NewBuffer(in.input))
				all := dst[:0]
				err := func() error {
					// Read all records until EOF or another error.
					for {
						record, err := cr.Read()
						if err == io.EOF {
							return nil
						}
						if err != nil {
							return errCSVParsingError(err)
						}
						var recDst []string
						if len(dst) > len(all) {
							recDst = dst[len(all)]
						}
						if cap(recDst) < len(record) {
							recDst = make([]string, len(record))
						}
						recDst = recDst[:len(record)]
						copy(recDst, record)
						all = append(all, recDst)
					}
				}()
				if err != nil {
					in.err = err
				}
				// We don't need the input any more.
				//nolint:staticcheck // SA6002 Using pointer would allocate more since we would have to copy slice header before taking a pointer.
				r.bufferPool.Put(in.input)
				in.input = nil
				in.dst <- all
			}
		}()
	}
	return nil
}

// NewReader - creates new CSV reader using readCloser.
func NewReader(readCloser io.ReadCloser, args *ReaderArgs) (*Reader, error) {
	if args == nil || args.IsEmpty() {
		panic(fmt.Errorf("empty args passed %v", args))
	}
	csvIn := io.Reader(readCloser)
	if args.RecordDelimiter != "\n" {
		csvIn = &recordTransform{
			reader:          readCloser,
			recordDelimiter: []byte(args.RecordDelimiter),
			oneByte:         make([]byte, len(args.RecordDelimiter)-1),
		}
	}

	r := &Reader{
		args:       args,
		buf:        bufio.NewReaderSize(csvIn, csvSplitSize*2),
		readCloser: readCloser,
		close:      make(chan struct{}),
	}

	// Assume args are validated by ReaderArgs.UnmarshalXML()
	newCsvReader := func(r io.Reader) *csv.Reader {
		ret := csv.NewReader(r)
		ret.Comma = []rune(args.FieldDelimiter)[0]
		ret.Comment = []rune(args.CommentCharacter)[0]
		ret.Quote = []rune{}
		if len([]rune(args.QuoteCharacter)) > 0 {
			// Add the first rune of args.QuoteCharacter
			ret.Quote = append(ret.Quote, []rune(args.QuoteCharacter)[0])
		}
		ret.QuoteEscape = []rune(args.QuoteEscapeCharacter)[0]
		ret.FieldsPerRecord = -1
		// If LazyQuotes is true, a quote may appear in an unquoted field and a
		// non-doubled quote may appear in a quoted field.
		ret.LazyQuotes = true
		// We do not trim leading space to keep consistent with s3.
		ret.TrimLeadingSpace = false
		ret.ReuseRecord = true
		return ret
	}

	return r, r.startReaders(newCsvReader)
}
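
// exampleReadAll is an illustrative usage sketch, not part of the original
// file: it shows the call pattern this reader expects. NewReader starts the
// splitter and parser goroutines, Read is called in a loop (passing the
// previous record back in so it can be reused) until io.EOF, and Close shuts
// the pipeline down. The function name is hypothetical, and args are assumed
// to have been validated elsewhere (e.g. via ReaderArgs.UnmarshalXML()).
func exampleReadAll(rc io.ReadCloser, args *ReaderArgs) (records int, err error) {
	r, err := NewReader(rc, args)
	if err != nil {
		return 0, err
	}
	defer r.Close()

	var rec sql.Record
	for {
		// Read may reuse the destination record; the previous record must not
		// be referenced after the next call to Read.
		rec, err = r.Read(rec)
		if err == io.EOF {
			return records, nil
		}
		if err != nil {
			return records, err
		}
		records++
	}
}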