github.com/apache/arrow/go/v14@v14.0.1/parquet/pqarrow/properties.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow

import (
	"context"

	"github.com/apache/arrow/go/v14/arrow"
	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
)

// ArrowWriterProperties are used to determine how to manipulate the arrow data
// when writing it to a parquet file.
type ArrowWriterProperties struct {
	mem                      memory.Allocator
	timestampAsInt96         bool
	coerceTimestamps         bool
	coerceTimestampUnit      arrow.TimeUnit
	allowTruncatedTimestamps bool
	storeSchema              bool
	noMapLogicalType         bool
	// compliantNestedTypes bool
}

// DefaultWriterProps returns the default properties for the arrow writer,
// which use memory.DefaultAllocator and arrow.Second as the coerceTimestampUnit.
func DefaultWriterProps() ArrowWriterProperties {
	return ArrowWriterProperties{
		mem:                 memory.DefaultAllocator,
		coerceTimestampUnit: arrow.Second,
	}
}

type config struct {
	props ArrowWriterProperties
}

// WriterOption is a convenience for building up arrow writer properties.
type WriterOption func(*config)

// NewArrowWriterProperties creates a new writer properties object by passing in
// a set of options to control the properties. Once created, an individual instance
// of ArrowWriterProperties is immutable.
func NewArrowWriterProperties(opts ...WriterOption) ArrowWriterProperties {
	cfg := config{DefaultWriterProps()}
	for _, o := range opts {
		o(&cfg)
	}
	return cfg.props
}

// WithAllocator specifies the allocator to be used by the writer whenever allocating
// buffers and memory.
func WithAllocator(mem memory.Allocator) WriterOption {
	return func(c *config) {
		c.props.mem = mem
	}
}

// WithDeprecatedInt96Timestamps enables conversion of arrow timestamps to int96
// columns when constructing the schema. Int96 is the legacy Impala representation:
// it is deprecated in the parquet format itself, but is sometimes still needed
// for compatibility with older readers.
func WithDeprecatedInt96Timestamps(enabled bool) WriterOption {
	return func(c *config) {
		c.props.timestampAsInt96 = enabled
	}
}
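// exampleWriterProps below is an illustrative sketch, not part of the
// upstream file: it shows how the options above compose into a single
// immutable ArrowWriterProperties value. The function name and the chosen
// options are hypothetical.
func exampleWriterProps() ArrowWriterProperties {
	return NewArrowWriterProperties(
		// use the default allocator explicitly, for clarity
		WithAllocator(memory.DefaultAllocator),
		// write timestamps as deprecated int96 for legacy Impala readers
		WithDeprecatedInt96Timestamps(true),
	)
}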
// WithCoerceTimestamps enables coercing timestamps to a specific time unit
// when constructing the schema and writing data, so that regardless of the unit
// used by the datatypes being written, they will be converted to the desired
// time unit.
func WithCoerceTimestamps(unit arrow.TimeUnit) WriterOption {
	return func(c *config) {
		c.props.coerceTimestamps = true
		c.props.coerceTimestampUnit = unit
	}
}

// WithTruncatedTimestamps, when called with true, turns off the error that would
// otherwise be returned if coercing a timestamp unit would cause a loss of data,
// such as converting from nanoseconds to seconds.
func WithTruncatedTimestamps(allow bool) WriterOption {
	return func(c *config) {
		c.props.allowTruncatedTimestamps = allow
	}
}

// WithStoreSchema enables writing a binary serialized arrow schema to the file
// metadata so that certain read options (like "read_dictionary") can be set
// automatically.
//
// If called, the arrow schema is serialized and base64 encoded before being added
// to the metadata of the parquet file under the key "ARROW:schema". If the key
// exists when opening a file for read with pqarrow.FileReader, the schema will be
// used to choose types and options when constructing the arrow schema of the
// resulting data.
func WithStoreSchema() WriterOption {
	return func(c *config) {
		c.props.storeSchema = true
	}
}

// WithNoMapLogicalType specifies that map typed columns should be written without
// the MAP logical type annotation when constructing the parquet schema.
func WithNoMapLogicalType() WriterOption {
	return func(c *config) {
		c.props.noMapLogicalType = true
	}
}

// func WithCompliantNestedTypes(enabled bool) WriterOption {
// 	return func(c *config) {
// 		c.props.compliantNestedTypes = enabled
// 	}
// }

type arrowWriteContext struct {
	props           ArrowWriterProperties
	dataBuffer      *memory.Buffer
	defLevelsBuffer encoding.Buffer
	repLevelsBuffer encoding.Buffer
}

type arrowCtxKey struct{}

// NewArrowWriteContext creates a reusable context object that contains the writer
// properties and other reusable buffers for writing. The resulting context should
// not be used to write multiple columns concurrently. If props is nil, then
// DefaultWriterProps will be used.
func NewArrowWriteContext(ctx context.Context, props *ArrowWriterProperties) context.Context {
	if props == nil {
		p := DefaultWriterProps()
		props = &p
	}
	return context.WithValue(ctx, arrowCtxKey{}, &arrowWriteContext{props: *props})
}

func arrowCtxFromContext(ctx context.Context) *arrowWriteContext {
	awc := ctx.Value(arrowCtxKey{})
	if awc != nil {
		return awc.(*arrowWriteContext)
	}

	return &arrowWriteContext{
		props: DefaultWriterProps(),
	}
}
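// exampleWriteContext below is an illustrative sketch, not part of the
// upstream file: it prepares a context carrying writer properties and the
// reusable write buffers via NewArrowWriteContext, e.g. before writing
// several columns serially from the same goroutine. The function name and
// the chosen options are hypothetical.
func exampleWriteContext(ctx context.Context) context.Context {
	props := NewArrowWriterProperties(
		// embed the serialized arrow schema in the file metadata
		WithStoreSchema(),
		// coerce all timestamp columns to microsecond precision
		WithCoerceTimestamps(arrow.Microsecond),
	)
	return NewArrowWriteContext(ctx, &props)
}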
// ArrowReadProperties defines the properties for reading a parquet file
// into arrow arrays.
type ArrowReadProperties struct {
	// If Parallel is true, then functions which read multiple columns will read
	// those columns in parallel from the file, with a number of readers equal
	// to the number of columns. Otherwise columns are read serially.
	Parallel bool
	// BatchSize is the size used for calls to NextBatch when reading whole columns.
	BatchSize int64

	readDictIndices map[int]struct{}
}

// SetReadDict determines whether or not to read a particular column as
// dictionary encoded.
func (props *ArrowReadProperties) SetReadDict(colIdx int, readDict bool) {
	if props.readDictIndices == nil {
		props.readDictIndices = make(map[int]struct{})
	}

	if readDict {
		props.readDictIndices[colIdx] = struct{}{}
	} else {
		delete(props.readDictIndices, colIdx)
	}
}

// ReadDict reports whether the column at colIdx was marked via SetReadDict
// to be read as dictionary encoded.
func (props *ArrowReadProperties) ReadDict(colIdx int) bool {
	if props.readDictIndices == nil {
		return false
	}

	_, ok := props.readDictIndices[colIdx]
	return ok
}
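// exampleReadProps below is an illustrative sketch, not part of the upstream
// file: it configures parallel column reads with a fixed batch size and marks
// column 0 to be read as dictionary encoded. The function name and the
// particular values are hypothetical.
func exampleReadProps() ArrowReadProperties {
	props := ArrowReadProperties{
		Parallel:  true, // read columns concurrently, one reader per column
		BatchSize: 1024, // rows per NextBatch call when reading whole columns
	}
	props.SetReadDict(0, true) // request dictionary decoding for column 0
	return props
}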