storj.io/minio@v0.0.0-20230509071714-0cbc90f649b1/pkg/s3select/internal/parquet-go/reader.go (about)

     1  /*
     2   * Minio Cloud Storage, (C) 2018 Minio, Inc.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package parquet
    18  
    19  import (
    20  	"encoding/binary"
    21  	"encoding/json"
    22  	"io"
    23  
    24  	"git.apache.org/thrift.git/lib/go/thrift"
    25  	"github.com/minio/minio-go/v7/pkg/set"
    26  
    27  	"storj.io/minio/pkg/s3select/internal/parquet-go/gen-go/parquet"
    28  )
    29  
    30  // GetReaderFunc - function type returning io.ReadCloser for requested offset/length.
    31  type GetReaderFunc func(offset, length int64) (io.ReadCloser, error)
    32  
    33  func footerSize(getReaderFunc GetReaderFunc) (size int64, err error) {
    34  	rc, err := getReaderFunc(-8, 4)
    35  	if err != nil {
    36  		return 0, err
    37  	}
    38  	defer rc.Close()
    39  
    40  	buf := make([]byte, 4)
    41  	if _, err = io.ReadFull(rc, buf); err != nil {
    42  		return 0, err
    43  	}
    44  
    45  	size = int64(binary.LittleEndian.Uint32(buf))
    46  
    47  	return size, nil
    48  }
    49  
    50  func fileMetadata(getReaderFunc GetReaderFunc) (*parquet.FileMetaData, error) {
    51  	size, err := footerSize(getReaderFunc)
    52  	if err != nil {
    53  		return nil, err
    54  	}
    55  
    56  	rc, err := getReaderFunc(-(8 + size), size)
    57  	if err != nil {
    58  		return nil, err
    59  	}
    60  	defer rc.Close()
    61  
    62  	fileMeta := parquet.NewFileMetaData()
    63  
    64  	pf := thrift.NewTCompactProtocolFactory()
    65  	protocol := pf.GetProtocol(thrift.NewStreamTransportR(rc))
    66  	err = fileMeta.Read(protocol)
    67  	if err != nil {
    68  		return nil, err
    69  	}
    70  
    71  	return fileMeta, nil
    72  }
    73  
    74  // Value - denotes column value
    75  type Value struct {
    76  	Value  interface{}
    77  	Type   parquet.Type
    78  	Schema *parquet.SchemaElement
    79  }
    80  
    81  // MarshalJSON - encodes to JSON data
    82  func (value Value) MarshalJSON() (data []byte, err error) {
    83  	return json.Marshal(value.Value)
    84  }
    85  
    86  // Reader - denotes parquet file.
    87  type Reader struct {
    88  	getReaderFunc  GetReaderFunc
    89  	schemaElements []*parquet.SchemaElement
    90  	rowGroups      []*parquet.RowGroup
    91  	rowGroupIndex  int
    92  
    93  	nameList    []string
    94  	columnNames set.StringSet
    95  	columns     map[string]*column
    96  	rowIndex    int64
    97  }
    98  
    99  // NewReader - creates new parquet reader. Reader calls getReaderFunc to get required data range for given columnNames. If columnNames is empty, all columns are used.
   100  func NewReader(getReaderFunc GetReaderFunc, columnNames set.StringSet) (*Reader, error) {
   101  	fileMeta, err := fileMetadata(getReaderFunc)
   102  	if err != nil {
   103  		return nil, err
   104  	}
   105  
   106  	nameList := []string{}
   107  	schemaElements := fileMeta.GetSchema()
   108  	for _, element := range schemaElements {
   109  		nameList = append(nameList, element.Name)
   110  	}
   111  
   112  	return &Reader{
   113  		getReaderFunc:  getReaderFunc,
   114  		rowGroups:      fileMeta.GetRowGroups(),
   115  		schemaElements: schemaElements,
   116  		nameList:       nameList,
   117  		columnNames:    columnNames,
   118  	}, nil
   119  }
   120  
   121  // Read - reads single record.
   122  func (reader *Reader) Read() (record *Record, err error) {
   123  	if reader.rowGroupIndex >= len(reader.rowGroups) {
   124  		return nil, io.EOF
   125  	}
   126  
   127  	if reader.columns == nil {
   128  		reader.columns, err = getColumns(
   129  			reader.rowGroups[reader.rowGroupIndex],
   130  			reader.columnNames,
   131  			reader.schemaElements,
   132  			reader.getReaderFunc,
   133  		)
   134  		if err != nil {
   135  			return nil, err
   136  		}
   137  
   138  		reader.rowIndex = 0
   139  	}
   140  
   141  	if reader.rowIndex >= reader.rowGroups[reader.rowGroupIndex].GetNumRows() {
   142  		reader.rowGroupIndex++
   143  		reader.Close()
   144  		return reader.Read()
   145  	}
   146  
   147  	record = newRecord(reader.nameList)
   148  	for name := range reader.columns {
   149  		col := reader.columns[name]
   150  		value, valueType, schema := col.read()
   151  		record.set(name, Value{Value: value, Type: valueType, Schema: schema})
   152  	}
   153  
   154  	reader.rowIndex++
   155  
   156  	return record, nil
   157  }
   158  
   159  // Close - closes underneath readers.
   160  func (reader *Reader) Close() (err error) {
   161  	for _, column := range reader.columns {
   162  		column.close()
   163  	}
   164  
   165  	reader.columns = nil
   166  	reader.rowIndex = 0
   167  
   168  	return nil
   169  }