github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/internal/s3select/json/preader.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package json

import (
	"bufio"
	"bytes"
	"io"
	"runtime"
	"sync"

	"github.com/bcicen/jstream"
	"github.com/minio/minio/internal/s3select/sql"
)

// PReader - JSON record reader for S3Select.
// Operates concurrently on line-delimited JSON.
type PReader struct {
	args        *ReaderArgs
	readCloser  io.ReadCloser   // raw input
	buf         *bufio.Reader   // input to the splitter
	current     []jstream.KVS   // current block of results to be returned
	recordsRead int             // number of records read in current slice
	input       chan *queueItem // input for workers
	queue       chan *queueItem // output from workers in order
	err         error           // global error state, only touched by PReader.Read
	bufferPool  sync.Pool       // pool of []byte objects for input
	kvDstPool   sync.Pool       // pool of []jstream.KVS used for output
	close       chan struct{}   // used for shutting down the splitter before end of stream
	readerWg    sync.WaitGroup  // used to keep track of async reader.
}

// queueItem is an item in the queue.
type queueItem struct {
	input []byte             // raw input sent to the worker
	dst   chan []jstream.KVS // result of block decode
	err   error              // any error encountered will be set here
}

// Read - reads single record.
// Once Read is called the previous record should no longer be referenced.
func (r *PReader) Read(dst sql.Record) (sql.Record, error) {
	// If we have any records left, return these before any error.
	for len(r.current) <= r.recordsRead {
		if r.err != nil {
			return nil, r.err
		}
		// Move to next block
		item, ok := <-r.queue
		if !ok {
			r.err = io.EOF
			return nil, r.err
		}
		//nolint:staticcheck // SA6002 Using pointer would allocate more since we would have to copy slice header before taking a pointer.
		r.kvDstPool.Put(r.current)
		r.current = <-item.dst
		r.err = item.err
		r.recordsRead = 0
	}
	kvRecord := r.current[r.recordsRead]
	r.recordsRead++

	dstRec, ok := dst.(*Record)
	if !ok {
		dstRec = &Record{}
	}
	dstRec.KVS = kvRecord
	dstRec.SelectFormat = sql.SelectFmtJSON
	return dstRec, nil
}

// Close - closes underlying reader.
func (r *PReader) Close() error {
	if r.close != nil {
		close(r.close)
		r.readerWg.Wait()
		r.close = nil
	}
	r.recordsRead = len(r.current)
	if r.err == nil {
		r.err = io.EOF
	}
	return r.readCloser.Close()
}

// nextSplit will attempt to skip a number of bytes and
// return the buffer until the next newline occurs.
// The last block will be sent along with an io.EOF.
func (r *PReader) nextSplit(skip int, dst []byte) ([]byte, error) {
	if cap(dst) < skip {
		dst = make([]byte, 0, skip+1024)
	}
	dst = dst[:skip]
	if skip > 0 {
		n, err := io.ReadFull(r.buf, dst)
		if err != nil && err != io.ErrUnexpectedEOF {
			// If an EOF happens after reading some but not all the bytes,
			// ReadFull returns ErrUnexpectedEOF.
			return dst[:n], err
		}
		dst = dst[:n]
		if err == io.ErrUnexpectedEOF {
			return dst, io.EOF
		}
	}
	// Read until next line.
	in, err := r.buf.ReadBytes('\n')
	dst = append(dst, in...)
	return dst, err
}

// jsonSplitSize is the size of each block.
// Blocks will read this much and find the first following newline.
// 128KB appears to be a very reasonable default.
const jsonSplitSize = 128 << 10

// startReaders spins up the splitter goroutine and a number of parser
// workers based on GOMAXPROCS.
func (r *PReader) startReaders() {
	r.bufferPool.New = func() interface{} {
		return make([]byte, jsonSplitSize+1024)
	}

	// Create queue
	r.queue = make(chan *queueItem, runtime.GOMAXPROCS(0))
	r.input = make(chan *queueItem, runtime.GOMAXPROCS(0))
	r.readerWg.Add(1)

	// Start splitter
	go func() {
		defer close(r.input)
		defer close(r.queue)
		defer r.readerWg.Done()
		for {
			next, err := r.nextSplit(jsonSplitSize, r.bufferPool.Get().([]byte))
			q := queueItem{
				input: next,
				dst:   make(chan []jstream.KVS, 1),
				err:   err,
			}
			select {
			case <-r.close:
				return
			case r.queue <- &q:
			}

			select {
			case <-r.close:
				return
			case r.input <- &q:
			}
			if err != nil {
				// Exit on any error.
				return
			}
		}
	}()

	// Start parsers
	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
		go func() {
			for in := range r.input {
				if len(in.input) == 0 {
					in.dst <- nil
					continue
				}
				dst, ok := r.kvDstPool.Get().([]jstream.KVS)
				if !ok {
					dst = make([]jstream.KVS, 0, 1000)
				}

				d := jstream.NewDecoder(bytes.NewBuffer(in.input), 0).ObjectAsKVS()
				stream := d.Stream()
				all := dst[:0]
				for mv := range stream {
					var kvs jstream.KVS
					if mv.ValueType == jstream.Object {
						// This is a JSON object type (that preserves key
						// order)
						kvs = mv.Value.(jstream.KVS)
					} else {
						// To be AWS S3 compatible Select for JSON needs to
						// output non-object JSON as single column value
						// i.e. a map with `_1` as key and value as the
						// non-object.
						kvs = jstream.KVS{jstream.KV{Key: "_1", Value: mv.Value}}
					}
					all = append(all, kvs)
				}
				// We don't need the input any more.
				//nolint:staticcheck // SA6002 Using pointer would allocate more since we would have to copy slice header before taking a pointer.
				r.bufferPool.Put(in.input)
				in.input = nil
				in.err = d.Err()
				in.dst <- all
			}
		}()
	}
}

// NewPReader - creates new parallel JSON reader using readCloser.
// Should only be used for LINES types.
func NewPReader(readCloser io.ReadCloser, args *ReaderArgs) *PReader {
	r := &PReader{
		args:       args,
		buf:        bufio.NewReaderSize(readCloser, jsonSplitSize*2),
		readCloser: readCloser,
		close:      make(chan struct{}),
	}
	r.startReaders()
	return r
}
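
The sketch below is not part of preader.go; it is a minimal, hypothetical driver showing how PReader is meant to be consumed: construct it over line-delimited JSON, call Read in a loop until io.EOF, and Close when done. It assumes the helper lives alongside this file in package json (so the io and sql imports above apply) and that a zero-valued ReaderArgs is acceptable for plain NDJSON input; real callers populate ReaderArgs from the S3 Select request.

// exampleCountRecords is a hypothetical helper (not part of the original
// file). It counts the records in a line-delimited JSON stream via PReader.
func exampleCountRecords(rc io.ReadCloser) (int, error) {
	r := NewPReader(rc, &ReaderArgs{}) // assumption: zero-valued args suffice for LINES input
	defer r.Close()

	var rec sql.Record
	n := 0
	for {
		next, err := r.Read(rec)
		if err == io.EOF {
			return n, nil // clean end of stream
		}
		if err != nil {
			return n, err
		}
		// Consume the record here; per Read's contract it may be
		// invalidated (reused) by the next call.
		n++
		rec = next // hand the record back so Read can reuse its allocation
	}
}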