github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/internal/s3select/parquet/reader.go (about) 1 // Copyright (c) 2015-2021 MinIO, Inc. 2 // 3 // This file is part of MinIO Object Storage stack 4 // 5 // This program is free software: you can redistribute it and/or modify 6 // it under the terms of the GNU Affero General Public License as published by 7 // the Free Software Foundation, either version 3 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU Affero General Public License for more details. 14 // 15 // You should have received a copy of the GNU Affero General Public License 16 // along with this program. If not, see <http://www.gnu.org/licenses/>. 17 18 package parquet 19 20 import ( 21 "errors" 22 "io" 23 "time" 24 25 "github.com/bcicen/jstream" 26 parquetgo "github.com/fraugster/parquet-go" 27 parquettypes "github.com/fraugster/parquet-go/parquet" 28 jsonfmt "github.com/minio/minio/internal/s3select/json" 29 "github.com/minio/minio/internal/s3select/sql" 30 ) 31 32 // Reader implements reading records from parquet input. 33 type Reader struct { 34 io.Closer 35 r *parquetgo.FileReader 36 } 37 38 // NewParquetReader creates a Reader2 from a io.ReadSeekCloser. 39 func NewParquetReader(rsc io.ReadSeekCloser, _ *ReaderArgs) (r *Reader, err error) { 40 fr, err := parquetgo.NewFileReader(rsc) 41 if err != nil { 42 return nil, errParquetParsingError(err) 43 } 44 45 return &Reader{Closer: rsc, r: fr}, nil 46 } 47 48 func (pr *Reader) Read(dst sql.Record) (rec sql.Record, rerr error) { 49 nextRow, err := pr.r.NextRow() 50 if err != nil { 51 if err == io.EOF { 52 return nil, err 53 } 54 return nil, errParquetParsingError(err) 55 } 56 57 kvs := jstream.KVS{} 58 for _, col := range pr.r.Columns() { 59 60 var value interface{} 61 if v, ok := nextRow[col.FlatName()]; ok { 62 value, err = convertFromAnnotation(col.Element(), v) 63 if err != nil { 64 return nil, errParquetParsingError(err) 65 } 66 } 67 kvs = append(kvs, jstream.KV{Key: col.FlatName(), Value: value}) 68 } 69 70 // Reuse destination if we can. 71 dstRec, ok := dst.(*jsonfmt.Record) 72 if !ok { 73 dstRec = &jsonfmt.Record{} 74 } 75 dstRec.SelectFormat = sql.SelectFmtParquet 76 dstRec.KVS = kvs 77 return dstRec, nil 78 } 79 80 // convertFromAnnotation - converts values based on the Parquet column's type 81 // annotations. LogicalType annotations if present override the deprecated 82 // ConvertedType annotations. Ref: 83 // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md 84 func convertFromAnnotation(se *parquettypes.SchemaElement, v interface{}) (interface{}, error) { 85 if se == nil { 86 return v, nil 87 } 88 89 var value interface{} 90 switch val := v.(type) { 91 case []byte: 92 // TODO: only strings are supported in s3select output (not 93 // binary arrays) - perhaps we need to check the annotation to 94 // ensure it's UTF8 encoded. 95 value = string(val) 96 case [12]byte: 97 // TODO: This is returned for the parquet INT96 type. We just 98 // treat it same as []byte (but AWS S3 treats it as a large int) 99 // - fix this later. 100 value = string(val[:]) 101 case int32: 102 value = int64(val) 103 if logicalType := se.GetLogicalType(); logicalType != nil { 104 if logicalType.IsSetDATE() { 105 value = sql.FormatSQLTimestamp(time.Unix(60*60*24*int64(val), 0).UTC()) 106 } 107 } else if se.GetConvertedType() == parquettypes.ConvertedType_DATE { 108 value = sql.FormatSQLTimestamp(time.Unix(60*60*24*int64(val), 0).UTC()) 109 } 110 case int64: 111 value = val 112 if logicalType := se.GetLogicalType(); logicalType != nil { 113 if ts := logicalType.GetTIMESTAMP(); ts != nil { 114 var duration time.Duration 115 // Only support UTC normalized timestamps. 116 if ts.IsAdjustedToUTC { 117 switch { 118 case ts.Unit.IsSetNANOS(): 119 duration = time.Duration(val) * time.Nanosecond 120 case ts.Unit.IsSetMILLIS(): 121 duration = time.Duration(val) * time.Millisecond 122 case ts.Unit.IsSetMICROS(): 123 duration = time.Duration(val) * time.Microsecond 124 default: 125 return nil, errors.New("Invalid LogicalType annotation found") 126 } 127 value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(duration)) 128 } 129 } else if se.GetConvertedType() == parquettypes.ConvertedType_TIMESTAMP_MILLIS { 130 duration := time.Duration(val) * time.Millisecond 131 value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(duration)) 132 } else if se.GetConvertedType() == parquettypes.ConvertedType_TIMESTAMP_MICROS { 133 duration := time.Duration(val) * time.Microsecond 134 value = sql.FormatSQLTimestamp(time.Unix(0, 0).Add(duration)) 135 } 136 } 137 case float32: 138 value = float64(val) 139 default: 140 value = v 141 } 142 return value, nil 143 }