github.com/apache/arrow/go/v7@v7.0.1/parquet/reader_properties.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package parquet
    18  
    19  import (
    20  	"bytes"
    21  	"io"
    22  
    23  	"github.com/apache/arrow/go/v7/arrow/memory"
    24  	"golang.org/x/xerrors"
    25  )
    26  
// ReaderProperties are used to define how the file reader will handle buffering and allocating buffers
type ReaderProperties struct {
	// alloc is the allocator returned by the Allocator method.
	alloc memory.Allocator
	// Default buffer size to utilize when reading chunks
	BufferSize int64
	// create with NewFileDecryptionProperties if dealing with an encrypted file
	FileDecryptProps *FileDecryptionProperties
	// BufferedStreamEnabled selects how GetStream exposes a section of the source.
	//
	// If this is set to true, GetStream returns an io.SectionReader over the
	// source, so bytes are only pulled from the underlying reader as they are
	// requested. If it is false (the default from NewReaderProperties), GetStream
	// reads the whole requested section into memory with a single ReadAt call and
	// serves later reads from that in-memory copy.
	//
	// When reading from higher latency IO, like S3, leaving this false may improve
	// performance because the entire section is fetched at once rather than through
	// multiple smaller data requests. For low latency IO streams, or if only small
	// portions / subsets of the section are going to be read, setting this to true
	// may reduce the amount of IO performed by avoiding reads of excess data.
	BufferedStreamEnabled bool
}
    45  
    46  // NewReaderProperties returns the default Reader Properties using the provided allocator.
    47  //
    48  // If nil is passed for the allocator, then memory.DefaultAllocator will be used.
    49  func NewReaderProperties(alloc memory.Allocator) *ReaderProperties {
    50  	if alloc == nil {
    51  		alloc = memory.DefaultAllocator
    52  	}
    53  	return &ReaderProperties{alloc, DefaultBufSize, nil, false}
    54  }
    55  
    56  // Allocator returns the allocator that the properties were initialized with
    57  func (r *ReaderProperties) Allocator() memory.Allocator { return r.alloc }
    58  
    59  // GetStream returns a section of the underlying reader based on whether or not BufferedStream is enabled.
    60  //
    61  // If BufferedStreamEnabled is true, it creates an io.SectionReader, otherwise it will read the entire section
    62  // into a buffer in memory and return a bytes.NewReader for that buffer.
    63  func (r *ReaderProperties) GetStream(source io.ReaderAt, start, nbytes int64) (ReaderAtSeeker, error) {
    64  	if r.BufferedStreamEnabled {
    65  		return io.NewSectionReader(source, start, nbytes), nil
    66  	}
    67  
    68  	data := make([]byte, nbytes)
    69  	n, err := source.ReadAt(data, start)
    70  	if err != nil {
    71  		return nil, xerrors.Errorf("parquet: tried reading from file, but got error: %w", err)
    72  	}
    73  	if n != int(nbytes) {
    74  		return nil, xerrors.Errorf("parquet: tried reading %d bytes starting at position %d from file but only got %d", nbytes, start, n)
    75  	}
    76  
    77  	return bytes.NewReader(data), nil
    78  }