github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/internal/s3select/json/preader.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package json

import (
	"bufio"
	"bytes"
	"io"
	"runtime"
	"sync"

	"github.com/bcicen/jstream"
	"github.com/minio/minio/internal/s3select/sql"
)

// PReader - JSON record reader for S3Select.
// Operates concurrently on line-delimited JSON.
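// A splitter goroutine cuts the input into newline-terminated blocks,
// worker goroutines decode those blocks in parallel, and Read returns
// records in the original input order.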
type PReader struct {
	args        *ReaderArgs
	readCloser  io.ReadCloser   // raw input
	buf         *bufio.Reader   // input to the splitter
	current     []jstream.KVS   // current block of results to be returned
	recordsRead int             // number of records read in current slice
	input       chan *queueItem // input for workers
	queue       chan *queueItem // output from workers in order
	err         error           // global error state, only touched by PReader.Read
	bufferPool  sync.Pool       // pool of []byte objects for input
	kvDstPool   sync.Pool       // pool of []jstream.KVS objects used for output
	close       chan struct{}   // used for shutting down the splitter before end of stream
	readerWg    sync.WaitGroup  // used to keep track of async reader.
}

// queueItem is an item in the queue.
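// Each item is sent both to the input channel (picked up by any free worker)
// and to the queue channel in split order; the worker delivers its decoded
// block on dst, so results are consumed in the order the input was split.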
type queueItem struct {
	input []byte             // raw input sent to the worker
	dst   chan []jstream.KVS // result of block decode
	err   error              // any error encountered will be set here
}

// Read - reads a single record.
// Once Read is called the previous record should no longer be referenced.
func (r *PReader) Read(dst sql.Record) (sql.Record, error) {
	// If we have any records left, return these before any error.
	for len(r.current) <= r.recordsRead {
		if r.err != nil {
			return nil, r.err
		}
		// Move to next block
		item, ok := <-r.queue
		if !ok {
			r.err = io.EOF
			return nil, r.err
		}
		//nolint:staticcheck // SA6002 Using pointer would allocate more since we would have to copy slice header before taking a pointer.
		r.kvDstPool.Put(r.current)
		r.current = <-item.dst
		r.err = item.err
		r.recordsRead = 0
	}
	kvRecord := r.current[r.recordsRead]
	r.recordsRead++

	dstRec, ok := dst.(*Record)
	if !ok {
		dstRec = &Record{}
	}
	dstRec.KVS = kvRecord
	dstRec.SelectFormat = sql.SelectFmtJSON
	return dstRec, nil
}

// Close - closes underlying reader.
func (r *PReader) Close() error {
	if r.close != nil {
		close(r.close)
		r.readerWg.Wait()
		r.close = nil
	}
	r.recordsRead = len(r.current)
	if r.err == nil {
		r.err = io.EOF
	}
	return r.readCloser.Close()
}

// nextSplit reads up to `skip` bytes into dst and then extends the block
// until the next newline, so blocks always end on a record boundary.
// The last block is returned along with io.EOF.
func (r *PReader) nextSplit(skip int, dst []byte) ([]byte, error) {
	if cap(dst) < skip {
		dst = make([]byte, 0, skip+1024)
	}
	dst = dst[:skip]
	if skip > 0 {
		n, err := io.ReadFull(r.buf, dst)
		if err != nil && err != io.ErrUnexpectedEOF {
			// If an EOF happens after reading some but not all the bytes,
			// ReadFull returns ErrUnexpectedEOF.
			return dst[:n], err
		}
		dst = dst[:n]
		if err == io.ErrUnexpectedEOF {
			return dst, io.EOF
		}
	}
	// Read until next line.
	in, err := r.buf.ReadBytes('\n')
	dst = append(dst, in...)
	return dst, err
}

// jsonSplitSize is the size of each block.
// Blocks will read this much and find the first following newline.
// 128KB appears to be a very reasonable default.
const jsonSplitSize = 128 << 10

// startReaders spins up the splitter goroutine and a number of parser
// workers based on GOMAXPROCS.
// Any error encountered while splitting or decoding is attached to the
// queue item it belongs to and surfaced by Read.
func (r *PReader) startReaders() {
	r.bufferPool.New = func() interface{} {
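		// jsonSplitSize plus a little slack, so the bytes appended up to the
		// next newline usually fit without growing the buffer.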
		return make([]byte, jsonSplitSize+1024)
	}

	// Create queue
	r.queue = make(chan *queueItem, runtime.GOMAXPROCS(0))
	r.input = make(chan *queueItem, runtime.GOMAXPROCS(0))
	r.readerWg.Add(1)

	// Start splitter
	go func() {
		defer close(r.input)
		defer close(r.queue)
		defer r.readerWg.Done()
		for {
			next, err := r.nextSplit(jsonSplitSize, r.bufferPool.Get().([]byte))
			q := queueItem{
				input: next,
				dst:   make(chan []jstream.KVS, 1),
				err:   err,
			}
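			// The same item is sent to both channels: r.queue preserves the
			// split order for Read, while r.input feeds whichever worker is
			// free, so output stays ordered despite the parallel decode.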
			select {
			case <-r.close:
				return
			case r.queue <- &q:
			}

			select {
			case <-r.close:
				return
			case r.input <- &q:
			}
			if err != nil {
				// Exit on any error.
				return
			}
		}
	}()

	// Start parsers
	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
		go func() {
			for in := range r.input {
				if len(in.input) == 0 {
					in.dst <- nil
					continue
				}
				dst, ok := r.kvDstPool.Get().([]jstream.KVS)
				if !ok {
					dst = make([]jstream.KVS, 0, 1000)
				}

				d := jstream.NewDecoder(bytes.NewBuffer(in.input), 0).ObjectAsKVS()
				stream := d.Stream()
				all := dst[:0]
				for mv := range stream {
					var kvs jstream.KVS
					if mv.ValueType == jstream.Object {
						// This is a JSON object type (that preserves key
						// order)
						kvs = mv.Value.(jstream.KVS)
					} else {
						// To be AWS S3 compatible Select for JSON needs to
						// output non-object JSON as single column value
						// i.e. a map with `_1` as key and value as the
						// non-object.
						kvs = jstream.KVS{jstream.KV{Key: "_1", Value: mv.Value}}
					}
					all = append(all, kvs)
				}
				// We don't need the input any more.
				//nolint:staticcheck // SA6002 Using pointer would allocate more since we would have to copy slice header before taking a pointer.
				r.bufferPool.Put(in.input)
				in.input = nil
				in.err = d.Err()
				in.dst <- all
			}
		}()
	}
}

// NewPReader - creates a new parallel JSON reader using readCloser.
// Should only be used for LINES types.
func NewPReader(readCloser io.ReadCloser, args *ReaderArgs) *PReader {
	r := &PReader{
		args:       args,
		buf:        bufio.NewReaderSize(readCloser, jsonSplitSize*2),
		readCloser: readCloser,
		close:      make(chan struct{}),
	}
	r.startReaders()
	return r
}
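
// examplePReaderCount is an illustrative sketch, not part of the upstream
// file: it shows one way a caller might drain a PReader until EOF, counting
// records. The function name is hypothetical and the ReaderArgs value is
// assumed to be configured by the caller. Each record is consumed
// immediately because, per Read's contract, a record must not be referenced
// after the next call to Read.
func examplePReaderCount(readCloser io.ReadCloser, args *ReaderArgs) (int, error) {
	r := NewPReader(readCloser, args)
	defer r.Close()

	n := 0
	for {
		_, err := r.Read(nil)
		if err == io.EOF {
			return n, nil
		}
		if err != nil {
			return n, err
		}
		n++
	}
}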