github.com/apache/arrow/go/v14@v14.0.2/parquet/reader_properties.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package parquet 18 19 import ( 20 "bytes" 21 "fmt" 22 "io" 23 24 "github.com/apache/arrow/go/v14/arrow/memory" 25 "github.com/apache/arrow/go/v14/internal/utils" 26 ) 27 28 // ReaderProperties are used to define how the file reader will handle buffering and allocating buffers 29 type ReaderProperties struct { 30 alloc memory.Allocator 31 // Default buffer size to utilize when reading chunks, when reading page 32 // headers or other metadata, this buffer may be increased if necessary 33 // to read in the necessary metadata. The value here is simply the default 34 // initial BufferSize when reading a new chunk. 35 BufferSize int64 36 // create with NewFileDecryptionProperties if dealing with an encrypted file 37 FileDecryptProps *FileDecryptionProperties 38 // If this is set to true, then the reader will use SectionReader to 39 // just use the read stream when reading data. Otherwise we will buffer 40 // the data we're going to read into memory first and then read that buffer. 41 // 42 // If reading from higher latency IO, like S3, it might improve performance to 43 // set this to true in order to read the entire row group in at once rather than 44 // make multiple smaller data requests. For low latency IO streams or if only 45 // reading small portions / subsets of the parquet file, this can be set to false 46 // to reduce the amount of IO performed in order to avoid reading excess amounts of data. 47 BufferedStreamEnabled bool 48 } 49 50 type BufferedReader interface { 51 Peek(int) ([]byte, error) 52 Discard(int) (int, error) 53 io.Reader 54 } 55 56 // NewReaderProperties returns the default Reader Properties using the provided allocator. 57 // 58 // If nil is passed for the allocator, then memory.DefaultAllocator will be used. 59 func NewReaderProperties(alloc memory.Allocator) *ReaderProperties { 60 if alloc == nil { 61 alloc = memory.DefaultAllocator 62 } 63 return &ReaderProperties{alloc, DefaultBufSize, nil, false} 64 } 65 66 // Allocator returns the allocator that the properties were initialized with 67 func (r *ReaderProperties) Allocator() memory.Allocator { return r.alloc } 68 69 // GetStream returns a section of the underlying reader based on whether or not BufferedStream is enabled. 70 // 71 // If BufferedStreamEnabled is true, it creates an io.SectionReader, otherwise it will read the entire section 72 // into a buffer in memory and return a bytes.NewReader for that buffer. 73 func (r *ReaderProperties) GetStream(source io.ReaderAt, start, nbytes int64) (BufferedReader, error) { 74 if r.BufferedStreamEnabled { 75 return utils.NewBufferedReader(io.NewSectionReader(source, start, nbytes), int(r.BufferSize)), nil 76 } 77 78 data := make([]byte, nbytes) 79 n, err := source.ReadAt(data, start) 80 if err != nil { 81 return nil, fmt.Errorf("parquet: tried reading from file, but got error: %w", err) 82 } 83 if n != int(nbytes) { 84 return nil, fmt.Errorf("parquet: tried reading %d bytes starting at position %d from file but only got %d", nbytes, start, n) 85 } 86 87 return utils.NewBufferedReader(bytes.NewReader(data), int(nbytes)), nil 88 }