github.com/apache/arrow/go/v14@v14.0.2/parquet/reader_properties.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package parquet
    18  
    19  import (
    20  	"bytes"
    21  	"fmt"
    22  	"io"
    23  
    24  	"github.com/apache/arrow/go/v14/arrow/memory"
    25  	"github.com/apache/arrow/go/v14/internal/utils"
    26  )
    27  
// ReaderProperties are used to define how the file reader will handle buffering and allocating buffers
type ReaderProperties struct {
	// alloc is the allocator handed back by Allocator. NewReaderProperties
	// stores memory.DefaultAllocator here when it is given a nil allocator.
	alloc memory.Allocator
	// Default buffer size to utilize when reading chunks, when reading page
	// headers or other metadata, this buffer may be increased if necessary
	// to read in the necessary metadata. The value here is simply the default
	// initial BufferSize when reading a new chunk.
	BufferSize int64
	// create with NewFileDecryptionProperties if dealing with an encrypted file
	FileDecryptProps *FileDecryptionProperties
	// If this is set to true, GetStream wraps the source in an io.SectionReader
	// and reads through it (passing BufferSize to the buffered reader) instead
	// of loading the section up front. Otherwise GetStream reads the entire
	// requested section into memory with a single ReadAt call and then serves
	// reads from that in-memory buffer.
	//
	// If reading from higher latency IO, like S3, it might improve performance to
	// leave this set to false so that the whole section is fetched in one request
	// rather than through multiple smaller reads. For low latency IO streams, or if
	// only reading small portions / subsets of the parquet file, this can be set to
	// true to avoid holding the whole section in memory and to avoid reading excess
	// amounts of data.
	BufferedStreamEnabled bool
}
    49  
    50  type BufferedReader interface {
    51  	Peek(int) ([]byte, error)
    52  	Discard(int) (int, error)
    53  	io.Reader
    54  }
    55  
    56  // NewReaderProperties returns the default Reader Properties using the provided allocator.
    57  //
    58  // If nil is passed for the allocator, then memory.DefaultAllocator will be used.
    59  func NewReaderProperties(alloc memory.Allocator) *ReaderProperties {
    60  	if alloc == nil {
    61  		alloc = memory.DefaultAllocator
    62  	}
    63  	return &ReaderProperties{alloc, DefaultBufSize, nil, false}
    64  }
    65  
    66  // Allocator returns the allocator that the properties were initialized with
    67  func (r *ReaderProperties) Allocator() memory.Allocator { return r.alloc }
    68  
    69  // GetStream returns a section of the underlying reader based on whether or not BufferedStream is enabled.
    70  //
    71  // If BufferedStreamEnabled is true, it creates an io.SectionReader, otherwise it will read the entire section
    72  // into a buffer in memory and return a bytes.NewReader for that buffer.
    73  func (r *ReaderProperties) GetStream(source io.ReaderAt, start, nbytes int64) (BufferedReader, error) {
    74  	if r.BufferedStreamEnabled {
    75  		return utils.NewBufferedReader(io.NewSectionReader(source, start, nbytes), int(r.BufferSize)), nil
    76  	}
    77  
    78  	data := make([]byte, nbytes)
    79  	n, err := source.ReadAt(data, start)
    80  	if err != nil {
    81  		return nil, fmt.Errorf("parquet: tried reading from file, but got error: %w", err)
    82  	}
    83  	if n != int(nbytes) {
    84  		return nil, fmt.Errorf("parquet: tried reading %d bytes starting at position %d from file but only got %d", nbytes, start, n)
    85  	}
    86  
    87  	return utils.NewBufferedReader(bytes.NewReader(data), int(nbytes)), nil
    88  }