storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/column.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2018 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package parquet
    18  
    19  import (
    20  	"errors"
    21  	"io"
    22  	"strings"
    23  
    24  	"git.apache.org/thrift.git/lib/go/thrift"
    25  	"github.com/minio/minio-go/v7/pkg/set"
    26  
    27  	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
    28  )
    29  
    30  func getColumns(
    31  	rowGroup *parquet.RowGroup,
    32  	columnNames set.StringSet,
    33  	schemaElements []*parquet.SchemaElement,
    34  	getReaderFunc GetReaderFunc,
    35  ) (nameColumnMap map[string]*column, err error) {
    36  	nameIndexMap := make(map[string]int)
    37  	for colIndex, columnChunk := range rowGroup.GetColumns() {
    38  		meta := columnChunk.GetMetaData()
    39  		if meta == nil {
    40  			return nil, errors.New("parquet: column metadata missing")
    41  		}
    42  		columnName := strings.Join(meta.GetPathInSchema(), ".")
    43  		if columnNames != nil && !columnNames.Contains(columnName) {
    44  			continue
    45  		}
    46  
    47  		// Ignore column spanning into another file.
    48  		if columnChunk.GetFilePath() != "" {
    49  			continue
    50  		}
    51  
    52  		offset := meta.GetDataPageOffset()
    53  		if meta.DictionaryPageOffset != nil {
    54  			offset = meta.GetDictionaryPageOffset()
    55  		}
    56  
    57  		size := meta.GetTotalCompressedSize()
    58  		if size < 0 {
    59  			return nil, errors.New("parquet: negative compressed size")
    60  		}
    61  		rc, err := getReaderFunc(offset, size)
    62  		if err != nil {
    63  			return nil, err
    64  		}
    65  
    66  		thriftReader := thrift.NewTBufferedTransport(thrift.NewStreamTransportR(rc), int(size))
    67  
    68  		if nameColumnMap == nil {
    69  			nameColumnMap = make(map[string]*column)
    70  		}
    71  		var se *parquet.SchemaElement
    72  		for _, schema := range schemaElements {
    73  			if schema != nil && schema.Name == columnName {
    74  				se = schema
    75  				break
    76  			}
    77  		}
    78  
    79  		nameColumnMap[columnName] = &column{
    80  			name:           columnName,
    81  			metadata:       meta,
    82  			schema:         se,
    83  			schemaElements: schemaElements,
    84  			rc:             rc,
    85  			thriftReader:   thriftReader,
    86  			valueType:      meta.GetType(),
    87  		}
    88  
    89  		// First element of []*parquet.SchemaElement from parquet file metadata is 'schema'
    90  		// which is always skipped, hence colIndex + 1 is valid.
    91  		nameIndexMap[columnName] = colIndex + 1
    92  	}
    93  
    94  	for name := range nameColumnMap {
    95  		nameColumnMap[name].nameIndexMap = nameIndexMap
    96  	}
    97  
    98  	return nameColumnMap, nil
    99  }
   100  
// column holds the reading state of a single column chunk: its metadata and
// schema, the reader its pages come from, and the values decoded so far that
// have not yet been handed out by read.
type column struct {
	// name is the dot-joined schema path of the column.
	name string
	// endOfValues is set by readPage when reading a page fails; read returns
	// nil values from then on.
	endOfValues bool
	// valueIndex is the position in dataTable.Values of the next value that
	// read returns.
	valueIndex int
	// valueType is the type taken from the column metadata at construction.
	valueType parquet.Type
	// metadata is the column chunk metadata.
	metadata *parquet.ColumnMetaData
	// schema is the schema element whose name equals name, or nil if none
	// was found.
	schema *parquet.SchemaElement
	// schemaElements is the schema element list handed to the page reader.
	schemaElements []*parquet.SchemaElement
	// nameIndexMap maps column names to their column index plus one; it is
	// shared by all columns created by the same getColumns call.
	nameIndexMap map[string]int
	// dictPage is the last dictionary page read; data pages are decoded
	// against it.
	dictPage *page
	// dataTable buffers decoded values; read sets it to nil once all of its
	// values have been returned, which triggers loading the next page.
	dataTable *table
	// rc is the underlying reader for the column chunk; released by close.
	rc io.ReadCloser
	// thriftReader is the buffered thrift transport wrapping rc that pages
	// are read from.
	thriftReader *thrift.TBufferedTransport
}
   115  
   116  func (column *column) close() (err error) {
   117  	if column.rc != nil {
   118  		err = column.rc.Close()
   119  		column.rc = nil
   120  	}
   121  
   122  	return err
   123  }
   124  
   125  func (column *column) readPage() {
   126  	page, _, _, err := readPage(
   127  		column.thriftReader,
   128  		column.metadata,
   129  		column.nameIndexMap,
   130  		column.schemaElements,
   131  	)
   132  
   133  	if err != nil {
   134  		column.endOfValues = true
   135  		return
   136  	}
   137  
   138  	if page.Header.GetType() == parquet.PageType_DICTIONARY_PAGE {
   139  		column.dictPage = page
   140  		column.readPage()
   141  		return
   142  	}
   143  
   144  	page.decode(column.dictPage)
   145  
   146  	if column.dataTable == nil {
   147  		column.dataTable = newTableFromTable(page.DataTable)
   148  	}
   149  
   150  	column.dataTable.Merge(page.DataTable)
   151  }
   152  
   153  func (column *column) read() (value interface{}, valueType parquet.Type, cnv *parquet.SchemaElement) {
   154  	if column.dataTable == nil {
   155  		column.readPage()
   156  		column.valueIndex = 0
   157  	}
   158  
   159  	if column.endOfValues {
   160  		return nil, column.metadata.GetType(), column.schema
   161  	}
   162  
   163  	value = column.dataTable.Values[column.valueIndex]
   164  	column.valueIndex++
   165  	if len(column.dataTable.Values) == column.valueIndex {
   166  		column.dataTable = nil
   167  	}
   168  
   169  	return value, column.metadata.GetType(), column.schema
   170  }