// Source: storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/reader.go

/*
 * Minio Cloud Storage, (C) 2018 Minio, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package parquet

import (
	"encoding/binary"
	"encoding/json"
	"io"

	"git.apache.org/thrift.git/lib/go/thrift"
	"github.com/minio/minio-go/v7/pkg/set"

	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
)

// GetReaderFunc - function type returning io.ReadCloser for requested offset/length.
31 type GetReaderFunc func(offset, length int64) (io.ReadCloser, error) 32 33 func footerSize(getReaderFunc GetReaderFunc) (size int64, err error) { 34 rc, err := getReaderFunc(-8, 4) 35 if err != nil { 36 return 0, err 37 } 38 defer rc.Close() 39 40 buf := make([]byte, 4) 41 if _, err = io.ReadFull(rc, buf); err != nil { 42 return 0, err 43 } 44 45 size = int64(binary.LittleEndian.Uint32(buf)) 46 47 return size, nil 48 } 49 50 func fileMetadata(getReaderFunc GetReaderFunc) (*parquet.FileMetaData, error) { 51 size, err := footerSize(getReaderFunc) 52 if err != nil { 53 return nil, err 54 } 55 56 rc, err := getReaderFunc(-(8 + size), size) 57 if err != nil { 58 return nil, err 59 } 60 defer rc.Close() 61 62 fileMeta := parquet.NewFileMetaData() 63 64 pf := thrift.NewTCompactProtocolFactory() 65 protocol := pf.GetProtocol(thrift.NewStreamTransportR(rc)) 66 err = fileMeta.Read(protocol) 67 if err != nil { 68 return nil, err 69 } 70 71 return fileMeta, nil 72 } 73 74 // Value - denotes column value 75 type Value struct { 76 Value interface{} 77 Type parquet.Type 78 Schema *parquet.SchemaElement 79 } 80 81 // MarshalJSON - encodes to JSON data 82 func (value Value) MarshalJSON() (data []byte, err error) { 83 return json.Marshal(value.Value) 84 } 85 86 // Reader - denotes parquet file. 87 type Reader struct { 88 getReaderFunc GetReaderFunc 89 schemaElements []*parquet.SchemaElement 90 rowGroups []*parquet.RowGroup 91 rowGroupIndex int 92 93 nameList []string 94 columnNames set.StringSet 95 columns map[string]*column 96 rowIndex int64 97 } 98 99 // NewReader - creates new parquet reader. Reader calls getReaderFunc to get required data range for given columnNames. If columnNames is empty, all columns are used. 
100 func NewReader(getReaderFunc GetReaderFunc, columnNames set.StringSet) (*Reader, error) { 101 fileMeta, err := fileMetadata(getReaderFunc) 102 if err != nil { 103 return nil, err 104 } 105 106 nameList := []string{} 107 schemaElements := fileMeta.GetSchema() 108 for _, element := range schemaElements { 109 nameList = append(nameList, element.Name) 110 } 111 112 return &Reader{ 113 getReaderFunc: getReaderFunc, 114 rowGroups: fileMeta.GetRowGroups(), 115 schemaElements: schemaElements, 116 nameList: nameList, 117 columnNames: columnNames, 118 }, nil 119 } 120 121 // Read - reads single record. 122 func (reader *Reader) Read() (record *Record, err error) { 123 if reader.rowGroupIndex >= len(reader.rowGroups) { 124 return nil, io.EOF 125 } 126 127 if reader.columns == nil { 128 reader.columns, err = getColumns( 129 reader.rowGroups[reader.rowGroupIndex], 130 reader.columnNames, 131 reader.schemaElements, 132 reader.getReaderFunc, 133 ) 134 if err != nil { 135 return nil, err 136 } 137 138 reader.rowIndex = 0 139 } 140 141 if reader.rowIndex >= reader.rowGroups[reader.rowGroupIndex].GetNumRows() { 142 reader.rowGroupIndex++ 143 reader.Close() 144 return reader.Read() 145 } 146 147 record = newRecord(reader.nameList) 148 for name := range reader.columns { 149 col := reader.columns[name] 150 value, valueType, schema := col.read() 151 record.set(name, Value{Value: value, Type: valueType, Schema: schema}) 152 } 153 154 reader.rowIndex++ 155 156 return record, nil 157 } 158 159 // Close - closes underneath readers. 160 func (reader *Reader) Close() (err error) { 161 for _, column := range reader.columns { 162 column.close() 163 } 164 165 reader.columns = nil 166 reader.rowIndex = 0 167 168 return nil 169 }