storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/parquet/reader.go (about)

     1  /*
     2   * MinIO Cloud Storage, (C) 2019 MinIO, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package parquet
    18  
    19  import (
    20  	"fmt"
    21  	"io"
    22  	"time"
    23  
    24  	"github.com/bcicen/jstream"
    25  
    26  	parquetgo "storj.io/minio/pkg/s3select/internal/parquet-go"
    27  	parquetgen "storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
    28  	jsonfmt "storj.io/minio/pkg/s3select/json"
    29  	"storj.io/minio/pkg/s3select/sql"
    30  )
    31  
// Reader - Parquet record reader for S3Select.
//
// It wraps a parquet-go reader and hands out its rows one record at a time
// through Read.
type Reader struct {
	args   *ReaderArgs       // arguments supplied to NewReader; not read by the methods shown here
	reader *parquetgo.Reader // underlying parquet-go reader that Read and Close delegate to
}
    37  
    38  // Read - reads single record.
    39  func (r *Reader) Read(dst sql.Record) (rec sql.Record, rerr error) {
    40  	defer func() {
    41  		if rec := recover(); rec != nil {
    42  			rerr = fmt.Errorf("panic reading parquet record: %v", rec)
    43  		}
    44  	}()
    45  
    46  	parquetRecord, err := r.reader.Read()
    47  	if err != nil {
    48  		if err != io.EOF {
    49  			return nil, errParquetParsingError(err)
    50  		}
    51  
    52  		return nil, err
    53  	}
    54  
    55  	kvs := jstream.KVS{}
    56  	f := func(name string, v parquetgo.Value) bool {
    57  		if v.Value == nil {
    58  			kvs = append(kvs, jstream.KV{Key: name, Value: nil})
    59  			return true
    60  		}
    61  
    62  		var value interface{}
    63  		switch v.Type {
    64  		case parquetgen.Type_BOOLEAN:
    65  			value = v.Value.(bool)
    66  		case parquetgen.Type_INT32:
    67  			value = int64(v.Value.(int32))
    68  			if v.Schema != nil && v.Schema.ConvertedType != nil {
    69  				switch *v.Schema.ConvertedType {
    70  				case parquetgen.ConvertedType_DATE:
    71  					value = sql.FormatSQLTimestamp(time.Unix(60*60*24*int64(v.Value.(int32)), 0).UTC())
    72  				}
    73  			}
    74  		case parquetgen.Type_INT64:
    75  			value = v.Value.(int64)
    76  			if v.Schema != nil && v.Schema.ConvertedType != nil {
    77  				switch *v.Schema.ConvertedType {
    78  				// Only UTC supported, add one NS to never be exactly midnight.
    79  				case parquetgen.ConvertedType_TIMESTAMP_MILLIS:
    80  					value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(time.Duration(v.Value.(int64)) * time.Millisecond).UTC())
    81  				case parquetgen.ConvertedType_TIMESTAMP_MICROS:
    82  					value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(time.Duration(v.Value.(int64)) * time.Microsecond).UTC())
    83  				}
    84  			}
    85  		case parquetgen.Type_FLOAT:
    86  			value = float64(v.Value.(float32))
    87  		case parquetgen.Type_DOUBLE:
    88  			value = v.Value.(float64)
    89  		case parquetgen.Type_INT96, parquetgen.Type_BYTE_ARRAY, parquetgen.Type_FIXED_LEN_BYTE_ARRAY:
    90  			value = string(v.Value.([]byte))
    91  		default:
    92  			rerr = errParquetParsingError(nil)
    93  			return false
    94  		}
    95  
    96  		kvs = append(kvs, jstream.KV{Key: name, Value: value})
    97  		return true
    98  	}
    99  
   100  	// Apply our range
   101  	parquetRecord.Range(f)
   102  
   103  	// Reuse destination if we can.
   104  	dstRec, ok := dst.(*jsonfmt.Record)
   105  	if !ok {
   106  		dstRec = &jsonfmt.Record{}
   107  	}
   108  	dstRec.SelectFormat = sql.SelectFmtParquet
   109  	dstRec.KVS = kvs
   110  	return dstRec, nil
   111  }
   112  
   113  // Close - closes underlying readers.
   114  func (r *Reader) Close() error {
   115  	return r.reader.Close()
   116  }
   117  
   118  // NewReader - creates new Parquet reader using readerFunc callback.
   119  func NewReader(getReaderFunc func(offset, length int64) (io.ReadCloser, error), args *ReaderArgs) (r *Reader, err error) {
   120  	defer func() {
   121  		if rec := recover(); rec != nil {
   122  			err = fmt.Errorf("panic reading parquet header: %v", rec)
   123  		}
   124  	}()
   125  	reader, err := parquetgo.NewReader(getReaderFunc, nil)
   126  	if err != nil {
   127  		if err != io.EOF {
   128  			return nil, errParquetParsingError(err)
   129  		}
   130  
   131  		return nil, err
   132  	}
   133  
   134  	return &Reader{
   135  		args:   args,
   136  		reader: reader,
   137  	}, nil
   138  }