storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/column.go (about) 1 /* 2 * Minio Cloud Storage, (C) 2018 Minio, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package parquet 18 19 import ( 20 "errors" 21 "io" 22 "strings" 23 24 "git.apache.org/thrift.git/lib/go/thrift" 25 "github.com/minio/minio-go/v7/pkg/set" 26 27 "storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet" 28 ) 29 30 func getColumns( 31 rowGroup *parquet.RowGroup, 32 columnNames set.StringSet, 33 schemaElements []*parquet.SchemaElement, 34 getReaderFunc GetReaderFunc, 35 ) (nameColumnMap map[string]*column, err error) { 36 nameIndexMap := make(map[string]int) 37 for colIndex, columnChunk := range rowGroup.GetColumns() { 38 meta := columnChunk.GetMetaData() 39 if meta == nil { 40 return nil, errors.New("parquet: column metadata missing") 41 } 42 columnName := strings.Join(meta.GetPathInSchema(), ".") 43 if columnNames != nil && !columnNames.Contains(columnName) { 44 continue 45 } 46 47 // Ignore column spanning into another file. 48 if columnChunk.GetFilePath() != "" { 49 continue 50 } 51 52 offset := meta.GetDataPageOffset() 53 if meta.DictionaryPageOffset != nil { 54 offset = meta.GetDictionaryPageOffset() 55 } 56 57 size := meta.GetTotalCompressedSize() 58 if size < 0 { 59 return nil, errors.New("parquet: negative compressed size") 60 } 61 rc, err := getReaderFunc(offset, size) 62 if err != nil { 63 return nil, err 64 } 65 66 thriftReader := thrift.NewTBufferedTransport(thrift.NewStreamTransportR(rc), int(size)) 67 68 if nameColumnMap == nil { 69 nameColumnMap = make(map[string]*column) 70 } 71 var se *parquet.SchemaElement 72 for _, schema := range schemaElements { 73 if schema != nil && schema.Name == columnName { 74 se = schema 75 break 76 } 77 } 78 79 nameColumnMap[columnName] = &column{ 80 name: columnName, 81 metadata: meta, 82 schema: se, 83 schemaElements: schemaElements, 84 rc: rc, 85 thriftReader: thriftReader, 86 valueType: meta.GetType(), 87 } 88 89 // First element of []*parquet.SchemaElement from parquet file metadata is 'schema' 90 // which is always skipped, hence colIndex + 1 is valid. 91 nameIndexMap[columnName] = colIndex + 1 92 } 93 94 for name := range nameColumnMap { 95 nameColumnMap[name].nameIndexMap = nameIndexMap 96 } 97 98 return nameColumnMap, nil 99 } 100 101 type column struct { 102 name string 103 endOfValues bool 104 valueIndex int 105 valueType parquet.Type 106 metadata *parquet.ColumnMetaData 107 schema *parquet.SchemaElement 108 schemaElements []*parquet.SchemaElement 109 nameIndexMap map[string]int 110 dictPage *page 111 dataTable *table 112 rc io.ReadCloser 113 thriftReader *thrift.TBufferedTransport 114 } 115 116 func (column *column) close() (err error) { 117 if column.rc != nil { 118 err = column.rc.Close() 119 column.rc = nil 120 } 121 122 return err 123 } 124 125 func (column *column) readPage() { 126 page, _, _, err := readPage( 127 column.thriftReader, 128 column.metadata, 129 column.nameIndexMap, 130 column.schemaElements, 131 ) 132 133 if err != nil { 134 column.endOfValues = true 135 return 136 } 137 138 if page.Header.GetType() == parquet.PageType_DICTIONARY_PAGE { 139 column.dictPage = page 140 column.readPage() 141 return 142 } 143 144 page.decode(column.dictPage) 145 146 if column.dataTable == nil { 147 column.dataTable = newTableFromTable(page.DataTable) 148 } 149 150 column.dataTable.Merge(page.DataTable) 151 } 152 153 func (column *column) read() (value interface{}, valueType parquet.Type, cnv *parquet.SchemaElement) { 154 if column.dataTable == nil { 155 column.readPage() 156 column.valueIndex = 0 157 } 158 159 if column.endOfValues { 160 return nil, column.metadata.GetType(), column.schema 161 } 162 163 value = column.dataTable.Values[column.valueIndex] 164 column.valueIndex++ 165 if len(column.dataTable.Values) == column.valueIndex { 166 column.dataTable = nil 167 } 168 169 return value, column.metadata.GetType(), column.schema 170 }