storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/csv/reader.go

/*
 * MinIO Cloud Storage, (C) 2019 MinIO, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package csv

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"runtime"
	"sync"
	"unicode/utf8"

	csv "storj.io/minio/pkg/csvparser"
	"storj.io/minio/pkg/s3select/sql"
)

// Reader - CSV record reader for S3Select.
type Reader struct {
	args         *ReaderArgs
	readCloser   io.ReadCloser    // raw input
	buf          *bufio.Reader    // input to the splitter
	columnNames  []string         // names of columns
	nameIndexMap map[string]int64 // name to column index
	current      [][]string       // current block of results to be returned
	recordsRead  int              // number of records read in current slice
	input        chan *queueItem  // input for workers
	queue        chan *queueItem  // output from workers in order
	err          error            // global error state, only touched by Reader.Read
	bufferPool   sync.Pool        // pool of []byte objects for input
	csvDstPool   sync.Pool        // pool of [][]string used for output
	close        chan struct{}    // used for shutting down the splitter before end of stream
	readerWg     sync.WaitGroup   // used to keep track of async reader.
}

// queueItem is an item in the queue.
type queueItem struct {
	input []byte          // raw input sent to the worker
	dst   chan [][]string // result of block decode
	err   error           // any error encountered will be set here
}

// Read - reads a single record.
// Once Read is called the previous record should no longer be referenced.
func (r *Reader) Read(dst sql.Record) (sql.Record, error) {
	// If we have any records left, return these before any error.
	for len(r.current) <= r.recordsRead {
		if r.err != nil {
			return nil, r.err
		}
		// Move to next block
		item, ok := <-r.queue
		if !ok {
			r.err = io.EOF
			return nil, r.err
		}
		//lint:ignore SA6002 Using pointer would allocate more since we would have to copy slice header before taking a pointer.
		r.csvDstPool.Put(r.current)
		r.current = <-item.dst
		r.err = item.err
		r.recordsRead = 0
	}
	csvRecord := r.current[r.recordsRead]
	r.recordsRead++

	// If no column names are set, use _(index)
	if r.columnNames == nil {
		r.columnNames = make([]string, len(csvRecord))
		for i := range csvRecord {
			r.columnNames[i] = fmt.Sprintf("_%v", i+1)
		}
	}

	// If no index map, add that.
	if r.nameIndexMap == nil {
		r.nameIndexMap = make(map[string]int64)
		for i := range r.columnNames {
			r.nameIndexMap[r.columnNames[i]] = int64(i)
		}
	}
	dstRec, ok := dst.(*Record)
	if !ok {
		dstRec = &Record{}
	}
	dstRec.columnNames = r.columnNames
	dstRec.csvRecord = csvRecord
	dstRec.nameIndexMap = r.nameIndexMap

	return dstRec, nil
}
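
// Pipeline overview: startReaders launches a splitter goroutine that cuts the
// stream into csvSplitSize blocks and hands each block, wrapped in a queueItem,
// to both r.queue and r.input. Worker goroutines take items off r.input, parse
// their block, and publish the parsed rows on the item's dst channel, while
// Read drains r.queue in submission order, so records come back in original
// stream order even though blocks are parsed concurrently.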

// Close - closes the underlying reader.
func (r *Reader) Close() error {
	if r.close != nil {
		close(r.close)
		r.readerWg.Wait()
		r.close = nil
	}
	r.recordsRead = len(r.current)
	if r.err == nil {
		r.err = io.EOF
	}
	return r.readCloser.Close()
}

// nextSplit will attempt to skip a number of bytes and
// return the buffer until the next newline occurs.
// The last block will be sent along with an io.EOF.
func (r *Reader) nextSplit(skip int, dst []byte) ([]byte, error) {
	if cap(dst) < skip {
		dst = make([]byte, 0, skip+1024)
	}
	dst = dst[:skip]
	if skip > 0 {
		n, err := io.ReadFull(r.buf, dst)
		if err != nil && err != io.ErrUnexpectedEOF {
			// If an EOF happens after reading some but not all the bytes,
			// ReadFull returns ErrUnexpectedEOF.
			return dst[:n], err
		}
		dst = dst[:n]
		if err == io.ErrUnexpectedEOF {
			return dst, io.EOF
		}
	}
	// Read until next line.
	in, err := r.buf.ReadBytes('\n')
	dst = append(dst, in...)
	return dst, err
}

// csvSplitSize is the size of each block.
// Blocks will read this much and find the first following newline.
// 128KB appears to be a very reasonable default.
const csvSplitSize = 128 << 10

// startReaders will read the header if needed and spin up a parser
// and a number of workers based on GOMAXPROCS.
// If an error is returned, no goroutines have been started and r.err will have been set.
func (r *Reader) startReaders(newReader func(io.Reader) *csv.Reader) error {
	if r.args.FileHeaderInfo != none {
		// Read column names
		// Get one line.
		b, err := r.nextSplit(0, nil)
		if err != nil {
			r.err = err
			return err
		}
		if !utf8.Valid(b) {
			return errInvalidTextEncodingError()
		}
		reader := newReader(bytes.NewReader(b))
		record, err := reader.Read()
		if err != nil {
			r.err = err
			if err != io.EOF {
				r.err = errCSVParsingError(err)
				return errCSVParsingError(err)
			}
			return err
		}

		if r.args.FileHeaderInfo == use {
			// Copy column names since records will be reused.
			columns := append(make([]string, 0, len(record)), record...)
			r.columnNames = columns
		}
	}

	r.bufferPool.New = func() interface{} {
		return make([]byte, csvSplitSize+1024)
	}

	// Read the first block.
	next, nextErr := r.nextSplit(csvSplitSize, r.bufferPool.Get().([]byte))
	// Check if first block is valid.
	if !utf8.Valid(next) {
		return errInvalidTextEncodingError()
	}

	// Create queue
	r.queue = make(chan *queueItem, runtime.GOMAXPROCS(0))
	r.input = make(chan *queueItem, runtime.GOMAXPROCS(0))
	r.readerWg.Add(1)

	// Start splitter
	go func() {
		defer close(r.input)
		defer close(r.queue)
		defer r.readerWg.Done()
		for {
			q := queueItem{
				input: next,
				dst:   make(chan [][]string, 1),
				err:   nextErr,
			}
			select {
			case <-r.close:
				return
			case r.queue <- &q:
			}

			select {
			case <-r.close:
				return
			case r.input <- &q:
			}
			if nextErr != nil {
				// Exit on any error.
				return
			}
			next, nextErr = r.nextSplit(csvSplitSize, r.bufferPool.Get().([]byte))
		}
	}()
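
	// Each parser below reuses a [][]string taken from csvDstPool for its
	// output (including the per-row []string slices inside it) and returns
	// the raw input block to bufferPool as soon as it has been parsed, which
	// keeps steady-state allocations low.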

	// Start parsers
	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
		go func() {
			for in := range r.input {
				if len(in.input) == 0 {
					in.dst <- nil
					continue
				}
				dst, ok := r.csvDstPool.Get().([][]string)
				if !ok {
					dst = make([][]string, 0, 1000)
				}

				cr := newReader(bytes.NewBuffer(in.input))
				all := dst[:0]
				err := func() error {
					// Read all records until EOF or another error.
					for {
						record, err := cr.Read()
						if err == io.EOF {
							return nil
						}
						if err != nil {
							return errCSVParsingError(err)
						}
						var recDst []string
						if len(dst) > len(all) {
							recDst = dst[len(all)]
						}
						if cap(recDst) < len(record) {
							recDst = make([]string, len(record))
						}
						recDst = recDst[:len(record)]
						copy(recDst, record)
						all = append(all, recDst)
					}
				}()
				if err != nil {
					in.err = err
				}
				// We don't need the input any more.
				//lint:ignore SA6002 Using pointer would allocate more since we would have to copy slice header before taking a pointer.
				r.bufferPool.Put(in.input)
				in.input = nil
				in.dst <- all
			}
		}()
	}
	return nil
}

// NewReader - creates a new CSV reader using readCloser.
func NewReader(readCloser io.ReadCloser, args *ReaderArgs) (*Reader, error) {
	if args == nil || args.IsEmpty() {
		panic(fmt.Errorf("empty args passed %v", args))
	}
	csvIn := io.Reader(readCloser)
	if args.RecordDelimiter != "\n" {
		csvIn = &recordTransform{
			reader:          readCloser,
			recordDelimiter: []byte(args.RecordDelimiter),
			oneByte:         make([]byte, len(args.RecordDelimiter)-1),
		}
	}

	r := &Reader{
		args:       args,
		buf:        bufio.NewReaderSize(csvIn, csvSplitSize*2),
		readCloser: readCloser,
		close:      make(chan struct{}),
	}

	// Assume args are validated by ReaderArgs.UnmarshalXML()
	newCsvReader := func(r io.Reader) *csv.Reader {
		ret := csv.NewReader(r)
		ret.Comma = []rune(args.FieldDelimiter)[0]
		ret.Comment = []rune(args.CommentCharacter)[0]
		ret.Quote = []rune{}
		if len([]rune(args.QuoteCharacter)) > 0 {
			// Add the first rune of args.QuoteCharacter
			ret.Quote = append(ret.Quote, []rune(args.QuoteCharacter)[0])
		}
		ret.QuoteEscape = []rune(args.QuoteEscapeCharacter)[0]
		ret.FieldsPerRecord = -1
		// If LazyQuotes is true, a quote may appear in an unquoted field and a
		// non-doubled quote may appear in a quoted field.
		ret.LazyQuotes = true
		// We do not trim leading space to keep consistent with s3.
		ret.TrimLeadingSpace = false
		ret.ReuseRecord = true
		return ret
	}

	return r, r.startReaders(newCsvReader)
}
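
// The sketch below is illustrative only and is not part of the original file:
// it shows how a caller might drive the Reader, assuming a populated
// *ReaderArgs (normally produced by ReaderArgs.UnmarshalXML) and an input
// io.ReadCloser. Read reuses the destination record, so the returned value
// must not be retained across calls. The function name is hypothetical.
func exampleReadLoop(rc io.ReadCloser, args *ReaderArgs) error {
	r, err := NewReader(rc, args)
	if err != nil {
		return err
	}
	defer r.Close()

	var rec sql.Record
	for {
		rec, err = r.Read(rec)
		if err == io.EOF {
			// End of stream.
			return nil
		}
		if err != nil {
			return err
		}
		_ = rec // Process the record before the next Read call.
	}
}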