github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/internal/s3select/parquet/reader.go (about)

     1  // Copyright (c) 2015-2021 MinIO, Inc.
     2  //
     3  // This file is part of MinIO Object Storage stack
     4  //
     5  // This program is free software: you can redistribute it and/or modify
     6  // it under the terms of the GNU Affero General Public License as published by
     7  // the Free Software Foundation, either version 3 of the License, or
     8  // (at your option) any later version.
     9  //
    10  // This program is distributed in the hope that it will be useful
    11  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13  // GNU Affero General Public License for more details.
    14  //
    15  // You should have received a copy of the GNU Affero General Public License
    16  // along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17  
    18  package parquet
    19  
    20  import (
    21  	"errors"
    22  	"io"
    23  	"time"
    24  
    25  	"github.com/bcicen/jstream"
    26  	parquetgo "github.com/fraugster/parquet-go"
    27  	parquettypes "github.com/fraugster/parquet-go/parquet"
    28  	jsonfmt "github.com/minio/minio/internal/s3select/json"
    29  	"github.com/minio/minio/internal/s3select/sql"
    30  )
    31  
// Reader implements reading records from parquet input.
type Reader struct {
	// Closer is the input stream handed to NewParquetReader; closing the
	// Reader closes that stream.
	io.Closer
	// r is the parquet-go file reader that Read pulls rows and column
	// metadata from.
	r *parquetgo.FileReader
}
    37  
    38  // NewParquetReader creates a Reader2 from a io.ReadSeekCloser.
    39  func NewParquetReader(rsc io.ReadSeekCloser, _ *ReaderArgs) (r *Reader, err error) {
    40  	fr, err := parquetgo.NewFileReader(rsc)
    41  	if err != nil {
    42  		return nil, errParquetParsingError(err)
    43  	}
    44  
    45  	return &Reader{Closer: rsc, r: fr}, nil
    46  }
    47  
    48  func (pr *Reader) Read(dst sql.Record) (rec sql.Record, rerr error) {
    49  	nextRow, err := pr.r.NextRow()
    50  	if err != nil {
    51  		if err == io.EOF {
    52  			return nil, err
    53  		}
    54  		return nil, errParquetParsingError(err)
    55  	}
    56  
    57  	kvs := jstream.KVS{}
    58  	for _, col := range pr.r.Columns() {
    59  
    60  		var value interface{}
    61  		if v, ok := nextRow[col.FlatName()]; ok {
    62  			value, err = convertFromAnnotation(col.Element(), v)
    63  			if err != nil {
    64  				return nil, errParquetParsingError(err)
    65  			}
    66  		}
    67  		kvs = append(kvs, jstream.KV{Key: col.FlatName(), Value: value})
    68  	}
    69  
    70  	// Reuse destination if we can.
    71  	dstRec, ok := dst.(*jsonfmt.Record)
    72  	if !ok {
    73  		dstRec = &jsonfmt.Record{}
    74  	}
    75  	dstRec.SelectFormat = sql.SelectFmtParquet
    76  	dstRec.KVS = kvs
    77  	return dstRec, nil
    78  }
    79  
    80  // convertFromAnnotation - converts values based on the Parquet column's type
    81  // annotations. LogicalType annotations if present override the deprecated
    82  // ConvertedType annotations. Ref:
    83  // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
    84  func convertFromAnnotation(se *parquettypes.SchemaElement, v interface{}) (interface{}, error) {
    85  	if se == nil {
    86  		return v, nil
    87  	}
    88  
    89  	var value interface{}
    90  	switch val := v.(type) {
    91  	case []byte:
    92  		// TODO: only strings are supported in s3select output (not
    93  		// binary arrays) - perhaps we need to check the annotation to
    94  		// ensure it's UTF8 encoded.
    95  		value = string(val)
    96  	case [12]byte:
    97  		// TODO: This is returned for the parquet INT96 type. We just
    98  		// treat it same as []byte (but AWS S3 treats it as a large int)
    99  		// - fix this later.
   100  		value = string(val[:])
   101  	case int32:
   102  		value = int64(val)
   103  		if logicalType := se.GetLogicalType(); logicalType != nil {
   104  			if logicalType.IsSetDATE() {
   105  				value = sql.FormatSQLTimestamp(time.Unix(60*60*24*int64(val), 0).UTC())
   106  			}
   107  		} else if se.GetConvertedType() == parquettypes.ConvertedType_DATE {
   108  			value = sql.FormatSQLTimestamp(time.Unix(60*60*24*int64(val), 0).UTC())
   109  		}
   110  	case int64:
   111  		value = val
   112  		if logicalType := se.GetLogicalType(); logicalType != nil {
   113  			if ts := logicalType.GetTIMESTAMP(); ts != nil {
   114  				var duration time.Duration
   115  				// Only support UTC normalized timestamps.
   116  				if ts.IsAdjustedToUTC {
   117  					switch {
   118  					case ts.Unit.IsSetNANOS():
   119  						duration = time.Duration(val) * time.Nanosecond
   120  					case ts.Unit.IsSetMILLIS():
   121  						duration = time.Duration(val) * time.Millisecond
   122  					case ts.Unit.IsSetMICROS():
   123  						duration = time.Duration(val) * time.Microsecond
   124  					default:
   125  						return nil, errors.New("Invalid LogicalType annotation found")
   126  					}
   127  					value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(duration))
   128  				}
   129  			} else if se.GetConvertedType() == parquettypes.ConvertedType_TIMESTAMP_MILLIS {
   130  				duration := time.Duration(val) * time.Millisecond
   131  				value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(duration))
   132  			} else if se.GetConvertedType() == parquettypes.ConvertedType_TIMESTAMP_MICROS {
   133  				duration := time.Duration(val) * time.Microsecond
   134  				value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(duration))
   135  			}
   136  		}
   137  	case float32:
   138  		value = float64(val)
   139  	default:
   140  		value = v
   141  	}
   142  	return value, nil
   143  }