github.com/apache/arrow/go/v14@v14.0.1/parquet/pqarrow/properties.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow
    18  
    19  import (
    20  	"context"
    21  
    22  	"github.com/apache/arrow/go/v14/arrow"
    23  	"github.com/apache/arrow/go/v14/arrow/memory"
    24  	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
    25  )
    26  
// ArrowWriterProperties are used to determine how to manipulate the arrow data
// when writing it to a parquet file.
type ArrowWriterProperties struct {
	// mem is the allocator used for buffers and memory while writing.
	mem memory.Allocator
	// timestampAsInt96 writes arrow timestamps as deprecated int96 columns
	// (see WithDeprecatedInt96Timestamps).
	timestampAsInt96 bool
	// coerceTimestamps, when true, converts timestamps to coerceTimestampUnit
	// (see WithCoerceTimestamps).
	coerceTimestamps bool
	// coerceTimestampUnit is the target unit when coerceTimestamps is set.
	coerceTimestampUnit arrow.TimeUnit
	// allowTruncatedTimestamps suppresses the error normally returned when
	// coercing a timestamp unit would lose data (see WithTruncatedTimestamps).
	allowTruncatedTimestamps bool
	// storeSchema embeds the base64-encoded serialized arrow schema in the
	// parquet file metadata under "ARROW:schema" (see WithStoreSchema).
	storeSchema bool
	// noMapLogicalType presumably skips the MAP logical type annotation when
	// building the schema — confirm against schema construction code.
	noMapLogicalType bool
	// compliantNestedTypes     bool
}
    39  
    40  // DefaultWriterProps returns the default properties for the arrow writer,
    41  // which are to use memory.DefaultAllocator and coerceTimestampUnit: arrow.Second.
    42  func DefaultWriterProps() ArrowWriterProperties {
    43  	return ArrowWriterProperties{
    44  		mem:                 memory.DefaultAllocator,
    45  		coerceTimestampUnit: arrow.Second,
    46  	}
    47  }
    48  
// config is the mutable builder state that WriterOption functions modify
// while NewArrowWriterProperties assembles an ArrowWriterProperties value.
type config struct {
	props ArrowWriterProperties
}

// WriterOption is a convenience for building up arrow writer properties
type WriterOption func(*config)
    55  
    56  // NewArrowWriterProperties creates a new writer properties object by passing in
    57  // a set of options to control the properties. Once created, an individual instance
    58  // of ArrowWriterProperties is immutable.
    59  func NewArrowWriterProperties(opts ...WriterOption) ArrowWriterProperties {
    60  	cfg := config{DefaultWriterProps()}
    61  	for _, o := range opts {
    62  		o(&cfg)
    63  	}
    64  	return cfg.props
    65  }
    66  
    67  // WithAllocator specifies the allocator to be used by the writer whenever allocating
    68  // buffers and memory.
    69  func WithAllocator(mem memory.Allocator) WriterOption {
    70  	return func(c *config) {
    71  		c.props.mem = mem
    72  	}
    73  }
    74  
    75  // WithDeprecatedInt96Timestamps allows specifying to enable conversion of arrow timestamps
    76  // to int96 columns when constructing the schema. Since int96 is the impala standard, it's
    77  // technically deprecated in terms of parquet files but is sometimes needed.
    78  func WithDeprecatedInt96Timestamps(enabled bool) WriterOption {
    79  	return func(c *config) {
    80  		c.props.timestampAsInt96 = enabled
    81  	}
    82  }
    83  
    84  // WithCoerceTimestamps enables coercing of timestamp units to a specific time unit
    85  // when constructing the schema and writing data so that regardless of the unit used
    86  // by the datatypes being written, they will be converted to the desired time unit.
    87  func WithCoerceTimestamps(unit arrow.TimeUnit) WriterOption {
    88  	return func(c *config) {
    89  		c.props.coerceTimestamps = true
    90  		c.props.coerceTimestampUnit = unit
    91  	}
    92  }
    93  
    94  // WithTruncatedTimestamps called with true turns off the error that would be returned
    95  // if coercing a timestamp unit would cause a loss of data such as converting from
    96  // nanoseconds to seconds.
    97  func WithTruncatedTimestamps(allow bool) WriterOption {
    98  	return func(c *config) {
    99  		c.props.allowTruncatedTimestamps = allow
   100  	}
   101  }
   102  
   103  // WithStoreSchema enables writing a binary serialized arrow schema to the file in metadata
   104  // to enable certain read options (like "read_dictionary") to be set automatically
   105  //
   106  // If called, the arrow schema is serialized and base64 encoded before being added to the
   107  // metadata of the parquet file with the key "ARROW:schema". If the key exists when
   108  // opening a file for read with pqarrow.FileReader, the schema will be used to choose
   109  // types and options when constructing the arrow schema of the resulting data.
   110  func WithStoreSchema() WriterOption {
   111  	return func(c *config) {
   112  		c.props.storeSchema = true
   113  	}
   114  }
   115  
   116  func WithNoMapLogicalType() WriterOption {
   117  	return func(c *config) {
   118  		c.props.noMapLogicalType = true
   119  	}
   120  }
   121  
   122  // func WithCompliantNestedTypes(enabled bool) WriterOption {
   123  // 	return func(c *config) {
   124  // 		c.props.compliantNestedTypes = enabled
   125  // 	}
   126  // }
   127  
// arrowWriteContext bundles the writer properties with reusable scratch
// buffers (data, definition levels, repetition levels) so they can be shared
// across column writes via a context.Context. Per NewArrowWriteContext, it
// must not be used to write multiple columns concurrently.
type arrowWriteContext struct {
	props           ArrowWriterProperties
	dataBuffer      *memory.Buffer
	defLevelsBuffer encoding.Buffer
	repLevelsBuffer encoding.Buffer
}

// arrowCtxKey is the unexported context key under which the
// *arrowWriteContext is stored; being package-private, it cannot collide
// with keys from other packages.
type arrowCtxKey struct{}
   136  
   137  // NewArrowWriteContext is for creating a re-usable context object that contains writer properties
   138  // and other re-usable buffers for writing. The resulting context should not be used to write
   139  // multiple columns concurrently. If nil is passed, then DefaultWriterProps will be used.
   140  func NewArrowWriteContext(ctx context.Context, props *ArrowWriterProperties) context.Context {
   141  	if props == nil {
   142  		p := DefaultWriterProps()
   143  		props = &p
   144  	}
   145  	return context.WithValue(ctx, arrowCtxKey{}, &arrowWriteContext{props: *props})
   146  }
   147  
   148  func arrowCtxFromContext(ctx context.Context) *arrowWriteContext {
   149  	awc := ctx.Value(arrowCtxKey{})
   150  	if awc != nil {
   151  		return awc.(*arrowWriteContext)
   152  	}
   153  
   154  	return &arrowWriteContext{
   155  		props: DefaultWriterProps(),
   156  	}
   157  }
   158  
// ArrowReadProperties is the properties to define how to read a parquet file
// into arrow arrays.
type ArrowReadProperties struct {
	// If Parallel is true, then functions which read multiple columns will read
	// those columns in parallel from the file with a number of readers equal
	// to the number of columns. Otherwise columns are read serially.
	Parallel bool
	// BatchSize is the size used for calls to NextBatch when reading whole columns
	BatchSize int64

	// readDictIndices is the set of column indices to read as dictionary
	// encoded; managed via SetReadDict and queried via ReadDict. Lazily
	// allocated, so a zero-value ArrowReadProperties is usable.
	readDictIndices map[int]struct{}
}
   171  
   172  // SetReadDict determines whether to read a particular column as dictionary
   173  // encoded or not.
   174  func (props *ArrowReadProperties) SetReadDict(colIdx int, readDict bool) {
   175  	if props.readDictIndices == nil {
   176  		props.readDictIndices = make(map[int]struct{})
   177  	}
   178  
   179  	if readDict {
   180  		props.readDictIndices[colIdx] = struct{}{}
   181  	} else {
   182  		delete(props.readDictIndices, colIdx)
   183  	}
   184  }
   185  
   186  func (props *ArrowReadProperties) ReadDict(colIdx int) bool {
   187  	if props.readDictIndices == nil {
   188  		return false
   189  	}
   190  
   191  	_, ok := props.readDictIndices[colIdx]
   192  	return ok
   193  }