github.com/apache/arrow/go/v14@v14.0.2/parquet/writer_properties.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package parquet 18 19 import ( 20 "github.com/apache/arrow/go/v14/arrow/memory" 21 "github.com/apache/arrow/go/v14/parquet/compress" 22 ) 23 24 // Constants for default property values used for the default reader, writer and column props. 25 const ( 26 // Default Buffer size used for the Reader 27 DefaultBufSize int64 = 4096 * 4 28 // Default data page size limit is 1K it's not guaranteed, but we will try to 29 // cut data pages off at this size where possible. 30 DefaultDataPageSize int64 = 1024 * 1024 31 // Default is for dictionary encoding to be turned on, use WithDictionaryDefault 32 // writer property to change that. 33 DefaultDictionaryEnabled = true 34 // If the dictionary reaches the size of this limitation, the writer will use 35 // the fallback encoding (usually plain) instead of continuing to build the 36 // dictionary index. 37 DefaultDictionaryPageSizeLimit = DefaultDataPageSize 38 // In order to attempt to facilitate data page size limits for writing, 39 // data is written in batches. Increasing the batch size may improve performance 40 // but the larger the batch size, the easier it is to overshoot the datapage limit. 41 DefaultWriteBatchSize int64 = 1024 42 // Default maximum number of rows for a single row group 43 DefaultMaxRowGroupLen int64 = 64 * 1024 * 1024 44 // Default is to have stats enabled for all columns, use writer properties to 45 // change the default, or to enable/disable for specific columns. 46 DefaultStatsEnabled = true 47 // If the stats are larger than 4K the writer will skip writing them out anyways. 48 DefaultMaxStatsSize int64 = 4096 49 DefaultCreatedBy = "parquet-go version 14.0.2" 50 DefaultRootName = "schema" 51 ) 52 53 // ColumnProperties defines the encoding, codec, and so on for a given column. 54 type ColumnProperties struct { 55 Encoding Encoding 56 Codec compress.Compression 57 DictionaryEnabled bool 58 StatsEnabled bool 59 MaxStatsSize int64 60 CompressionLevel int 61 } 62 63 // DefaultColumnProperties returns the default properties which get utilized for writing. 64 // 65 // The default column properties are the following constants: 66 // 67 // Encoding: Encodings.Plain 68 // Codec: compress.Codecs.Uncompressed 69 // DictionaryEnabled: DefaultDictionaryEnabled 70 // StatsEnabled: DefaultStatsEnabled 71 // MaxStatsSize: DefaultMaxStatsSize 72 // CompressionLevel: compress.DefaultCompressionLevel 73 func DefaultColumnProperties() ColumnProperties { 74 return ColumnProperties{ 75 Encoding: Encodings.Plain, 76 Codec: compress.Codecs.Uncompressed, 77 DictionaryEnabled: DefaultDictionaryEnabled, 78 StatsEnabled: DefaultStatsEnabled, 79 MaxStatsSize: DefaultMaxStatsSize, 80 CompressionLevel: compress.DefaultCompressionLevel, 81 } 82 } 83 84 type writerPropConfig struct { 85 wr *WriterProperties 86 encodings map[string]Encoding 87 codecs map[string]compress.Compression 88 compressLevel map[string]int 89 dictEnabled map[string]bool 90 statsEnabled map[string]bool 91 } 92 93 // WriterProperty is used as the options for building a writer properties instance 94 type WriterProperty func(*writerPropConfig) 95 96 // WithAllocator specifies the writer to use the given allocator 97 func WithAllocator(mem memory.Allocator) WriterProperty { 98 return func(cfg *writerPropConfig) { 99 cfg.wr.mem = mem 100 } 101 } 102 103 // WithDictionaryDefault sets the default value for whether to enable dictionary encoding 104 func WithDictionaryDefault(dict bool) WriterProperty { 105 return func(cfg *writerPropConfig) { 106 cfg.wr.defColumnProps.DictionaryEnabled = dict 107 } 108 } 109 110 // WithDictionaryFor allows enabling or disabling dictionary encoding for a given column path string 111 func WithDictionaryFor(path string, dict bool) WriterProperty { 112 return func(cfg *writerPropConfig) { 113 cfg.dictEnabled[path] = dict 114 } 115 } 116 117 // WithDictionaryPath is like WithDictionaryFor, but takes a ColumnPath type 118 func WithDictionaryPath(path ColumnPath, dict bool) WriterProperty { 119 return WithDictionaryFor(path.String(), dict) 120 } 121 122 // WithDictionaryPageSizeLimit is the limit of the dictionary at which the writer 123 // will fallback to plain encoding instead 124 func WithDictionaryPageSizeLimit(limit int64) WriterProperty { 125 return func(cfg *writerPropConfig) { 126 cfg.wr.dictPagesize = limit 127 } 128 } 129 130 // WithBatchSize specifies the number of rows to use for batch writes to columns 131 func WithBatchSize(batch int64) WriterProperty { 132 return func(cfg *writerPropConfig) { 133 cfg.wr.batchSize = batch 134 } 135 } 136 137 // WithMaxRowGroupLength specifies the number of rows as the maximum number of rows for a given row group in the writer. 138 func WithMaxRowGroupLength(nrows int64) WriterProperty { 139 return func(cfg *writerPropConfig) { 140 cfg.wr.maxRowGroupLen = nrows 141 } 142 } 143 144 // WithDataPageSize specifies the size to use for splitting data pages for column writing. 145 func WithDataPageSize(pgsize int64) WriterProperty { 146 return func(cfg *writerPropConfig) { 147 cfg.wr.pageSize = pgsize 148 } 149 } 150 151 // WithDataPageVersion specifies whether to use Version 1 or Version 2 of the DataPage spec 152 func WithDataPageVersion(version DataPageVersion) WriterProperty { 153 return func(cfg *writerPropConfig) { 154 cfg.wr.dataPageVersion = version 155 } 156 } 157 158 // WithVersion specifies which Parquet Spec version to utilize for writing. 159 func WithVersion(version Version) WriterProperty { 160 return func(cfg *writerPropConfig) { 161 cfg.wr.parquetVersion = version 162 } 163 } 164 165 // WithCreatedBy specifies the "created by" string to use for the writer 166 func WithCreatedBy(createdby string) WriterProperty { 167 return func(cfg *writerPropConfig) { 168 cfg.wr.createdBy = createdby 169 } 170 } 171 172 // WithRootName enables customization of the name used for the root schema node. This is required 173 // to maintain compatibility with other tools. 174 func WithRootName(name string) WriterProperty { 175 return func(cfg *writerPropConfig) { 176 cfg.wr.rootName = name 177 } 178 } 179 180 // WithRootRepetition enables customization of the repetition used for the root schema node. 181 // This is required to maintain compatibility with other tools. 182 func WithRootRepetition(repetition Repetition) WriterProperty { 183 return func(cfg *writerPropConfig) { 184 cfg.wr.rootRepetition = repetition 185 } 186 } 187 188 // WithEncoding defines the encoding that is used when we aren't using dictionary encoding. 189 // 190 // This is either applied if dictionary encoding is disabled, or if we fallback if the dictionary 191 // grew too large. 192 func WithEncoding(encoding Encoding) WriterProperty { 193 return func(cfg *writerPropConfig) { 194 if encoding == Encodings.PlainDict || encoding == Encodings.RLEDict { 195 panic("parquet: can't use dictionary encoding as fallback encoding") 196 } 197 cfg.wr.defColumnProps.Encoding = encoding 198 } 199 } 200 201 // WithEncodingFor is for defining the encoding only for a specific column path. This encoding will be used 202 // if dictionary encoding is disabled for the column or if we fallback because the dictionary grew too large 203 func WithEncodingFor(path string, encoding Encoding) WriterProperty { 204 return func(cfg *writerPropConfig) { 205 if encoding == Encodings.PlainDict || encoding == Encodings.RLEDict { 206 panic("parquet: can't use dictionary encoding as fallback encoding") 207 } 208 cfg.encodings[path] = encoding 209 } 210 } 211 212 // WithEncodingPath is the same as WithEncodingFor but takes a ColumnPath directly. 213 func WithEncodingPath(path ColumnPath, encoding Encoding) WriterProperty { 214 return WithEncodingFor(path.String(), encoding) 215 } 216 217 // WithCompression specifies the default compression type to use for column writing. 218 func WithCompression(codec compress.Compression) WriterProperty { 219 return func(cfg *writerPropConfig) { 220 cfg.wr.defColumnProps.Codec = codec 221 } 222 } 223 224 // WithCompressionFor specifies the compression type for the given column. 225 func WithCompressionFor(path string, codec compress.Compression) WriterProperty { 226 return func(cfg *writerPropConfig) { 227 cfg.codecs[path] = codec 228 } 229 } 230 231 // WithCompressionPath is the same as WithCompressionFor but takes a ColumnPath directly. 232 func WithCompressionPath(path ColumnPath, codec compress.Compression) WriterProperty { 233 return WithCompressionFor(path.String(), codec) 234 } 235 236 // WithMaxStatsSize sets a maximum size for the statistics before we decide not to include them. 237 func WithMaxStatsSize(maxStatsSize int64) WriterProperty { 238 return func(cfg *writerPropConfig) { 239 cfg.wr.defColumnProps.MaxStatsSize = maxStatsSize 240 } 241 } 242 243 // WithCompressionLevel specifies the default compression level for the compressor in every column. 244 // 245 // The provided compression level is compressor specific. The user would have to know what the available 246 // levels are for the selected compressor. If the compressor does not allow for selecting different 247 // compression levels, then this function will have no effect. Parquet and Arrow will not validate the 248 // passed compression level. If no level is selected by the user or if the special compress.DefaultCompressionLevel 249 // value is used, then parquet will select the compression level. 250 func WithCompressionLevel(level int) WriterProperty { 251 return func(cfg *writerPropConfig) { 252 cfg.wr.defColumnProps.CompressionLevel = level 253 } 254 } 255 256 // WithCompressionLevelFor is like WithCompressionLevel but only for the given column path. 257 func WithCompressionLevelFor(path string, level int) WriterProperty { 258 return func(cfg *writerPropConfig) { 259 cfg.compressLevel[path] = level 260 } 261 } 262 263 // WithCompressionLevelPath is the same as WithCompressionLevelFor but takes a ColumnPath 264 func WithCompressionLevelPath(path ColumnPath, level int) WriterProperty { 265 return WithCompressionLevelFor(path.String(), level) 266 } 267 268 // WithStats specifies a default for whether or not to enable column statistics. 269 func WithStats(enabled bool) WriterProperty { 270 return func(cfg *writerPropConfig) { 271 cfg.wr.defColumnProps.StatsEnabled = enabled 272 } 273 } 274 275 // WithStatsFor specifies a per column value as to enable or disable statistics in the resulting file. 276 func WithStatsFor(path string, enabled bool) WriterProperty { 277 return func(cfg *writerPropConfig) { 278 cfg.statsEnabled[path] = enabled 279 } 280 } 281 282 // WithStatsPath is the same as WithStatsFor but takes a ColumnPath 283 func WithStatsPath(path ColumnPath, enabled bool) WriterProperty { 284 return WithStatsFor(path.String(), enabled) 285 } 286 287 // WithEncryptionProperties specifies the file level encryption handling for writing the file. 288 func WithEncryptionProperties(props *FileEncryptionProperties) WriterProperty { 289 return func(cfg *writerPropConfig) { 290 cfg.wr.encryptionProps = props 291 } 292 } 293 294 // WithStoreDecimalAsInteger specifies whether to try using an int32/int64 for storing 295 // decimal data rather than fixed len byte arrays if the precision is low enough. 296 func WithStoreDecimalAsInteger(enabled bool) WriterProperty { 297 return func(cfg *writerPropConfig) { 298 cfg.wr.storeDecimalAsInt = enabled 299 } 300 } 301 302 // WriterProperties is the collection of properties to use for writing a parquet file. The values are 303 // read only once it has been constructed. 304 type WriterProperties struct { 305 mem memory.Allocator 306 dictPagesize int64 307 batchSize int64 308 maxRowGroupLen int64 309 pageSize int64 310 parquetVersion Version 311 createdBy string 312 dataPageVersion DataPageVersion 313 rootName string 314 rootRepetition Repetition 315 storeDecimalAsInt bool 316 317 defColumnProps ColumnProperties 318 columnProps map[string]*ColumnProperties 319 encryptionProps *FileEncryptionProperties 320 } 321 322 func defaultWriterProperties() *WriterProperties { 323 return &WriterProperties{ 324 mem: memory.DefaultAllocator, 325 dictPagesize: DefaultDictionaryPageSizeLimit, 326 batchSize: DefaultWriteBatchSize, 327 maxRowGroupLen: DefaultMaxRowGroupLen, 328 pageSize: DefaultDataPageSize, 329 parquetVersion: V2_LATEST, 330 dataPageVersion: DataPageV1, 331 createdBy: DefaultCreatedBy, 332 rootName: DefaultRootName, 333 rootRepetition: Repetitions.Repeated, 334 defColumnProps: DefaultColumnProperties(), 335 } 336 } 337 338 // NewWriterProperties takes a list of options for building the properties. If multiple options are used which conflict 339 // then the last option is the one which will take effect. If no WriterProperty options are provided, then the default 340 // properties will be utilized for writing. 341 // 342 // The Default properties use the following constants: 343 // 344 // Allocator: memory.DefaultAllocator 345 // DictionaryPageSize: DefaultDictionaryPageSizeLimit 346 // BatchSize: DefaultWriteBatchSize 347 // MaxRowGroupLength: DefaultMaxRowGroupLen 348 // PageSize: DefaultDataPageSize 349 // ParquetVersion: V1 350 // DataPageVersion: DataPageV1 351 // CreatedBy: DefaultCreatedBy 352 func NewWriterProperties(opts ...WriterProperty) *WriterProperties { 353 cfg := writerPropConfig{ 354 wr: defaultWriterProperties(), 355 encodings: make(map[string]Encoding), 356 codecs: make(map[string]compress.Compression), 357 compressLevel: make(map[string]int), 358 dictEnabled: make(map[string]bool), 359 statsEnabled: make(map[string]bool), 360 } 361 for _, o := range opts { 362 o(&cfg) 363 } 364 365 cfg.wr.columnProps = make(map[string]*ColumnProperties) 366 get := func(key string) *ColumnProperties { 367 if p, ok := cfg.wr.columnProps[key]; ok { 368 return p 369 } 370 cfg.wr.columnProps[key] = new(ColumnProperties) 371 *cfg.wr.columnProps[key] = cfg.wr.defColumnProps 372 return cfg.wr.columnProps[key] 373 } 374 375 for key, value := range cfg.encodings { 376 get(key).Encoding = value 377 } 378 379 for key, value := range cfg.codecs { 380 get(key).Codec = value 381 } 382 383 for key, value := range cfg.compressLevel { 384 get(key).CompressionLevel = value 385 } 386 387 for key, value := range cfg.dictEnabled { 388 get(key).DictionaryEnabled = value 389 } 390 391 for key, value := range cfg.statsEnabled { 392 get(key).StatsEnabled = value 393 } 394 return cfg.wr 395 } 396 397 // FileEncryptionProperties returns the current encryption properties that were 398 // used to create the writer properties. 399 func (w *WriterProperties) FileEncryptionProperties() *FileEncryptionProperties { 400 return w.encryptionProps 401 } 402 403 func (w *WriterProperties) Allocator() memory.Allocator { return w.mem } 404 func (w *WriterProperties) CreatedBy() string { return w.createdBy } 405 func (w *WriterProperties) RootName() string { return w.rootName } 406 func (w *WriterProperties) RootRepetition() Repetition { return w.rootRepetition } 407 func (w *WriterProperties) WriteBatchSize() int64 { return w.batchSize } 408 func (w *WriterProperties) DataPageSize() int64 { return w.pageSize } 409 func (w *WriterProperties) DictionaryPageSizeLimit() int64 { return w.dictPagesize } 410 func (w *WriterProperties) Version() Version { return w.parquetVersion } 411 func (w *WriterProperties) DataPageVersion() DataPageVersion { return w.dataPageVersion } 412 func (w *WriterProperties) MaxRowGroupLength() int64 { return w.maxRowGroupLen } 413 414 // Compression returns the default compression type that will be used for any columns that don't 415 // have a specific compression defined. 416 func (w *WriterProperties) Compression() compress.Compression { return w.defColumnProps.Codec } 417 418 // CompressionFor will return the compression type that is specified for the given column path, or 419 // the default compression codec if there isn't one specific to this column. 420 func (w *WriterProperties) CompressionFor(path string) compress.Compression { 421 if p, ok := w.columnProps[path]; ok { 422 return p.Codec 423 } 424 return w.defColumnProps.Codec 425 } 426 427 // CompressionPath is the same as CompressionFor but takes a ColumnPath 428 func (w *WriterProperties) CompressionPath(path ColumnPath) compress.Compression { 429 return w.CompressionFor(path.String()) 430 } 431 432 // CompressionLevel returns the default compression level that will be used for any column 433 // that doesn't have a compression level specified for it. 434 func (w *WriterProperties) CompressionLevel() int { return w.defColumnProps.CompressionLevel } 435 436 // CompressionLevelFor returns the compression level that will be utilized for the given column, 437 // or the default compression level if the column doesn't have a specific level specified. 438 func (w *WriterProperties) CompressionLevelFor(path string) int { 439 if p, ok := w.columnProps[path]; ok { 440 return p.CompressionLevel 441 } 442 return w.defColumnProps.CompressionLevel 443 } 444 445 // CompressionLevelPath is the same as CompressionLevelFor but takes a ColumnPath object 446 func (w *WriterProperties) CompressionLevelPath(path ColumnPath) int { 447 return w.CompressionLevelFor(path.String()) 448 } 449 450 // Encoding returns the default encoding that will be utilized for any columns which don't have a different value 451 // specified. 452 func (w *WriterProperties) Encoding() Encoding { return w.defColumnProps.Encoding } 453 454 // EncodingFor returns the encoding that will be used for the given column path, or the default encoding if there 455 // isn't one specified for this column. 456 func (w *WriterProperties) EncodingFor(path string) Encoding { 457 if p, ok := w.columnProps[path]; ok { 458 return p.Encoding 459 } 460 return w.defColumnProps.Encoding 461 } 462 463 // EncodingPath is the same as EncodingFor but takes a ColumnPath object 464 func (w *WriterProperties) EncodingPath(path ColumnPath) Encoding { 465 return w.EncodingFor(path.String()) 466 } 467 468 // DictionaryIndexEncoding returns which encoding will be used for the Dictionary Index values based on the 469 // parquet version. V1 uses PlainDict and V2 uses RLEDict 470 func (w *WriterProperties) DictionaryIndexEncoding() Encoding { 471 if w.parquetVersion == V1_0 { 472 return Encodings.PlainDict 473 } 474 return Encodings.RLEDict 475 } 476 477 // DictionaryPageEncoding returns the encoding that will be utilized for the DictionaryPage itself based on the parquet 478 // version. V1 uses PlainDict, v2 uses Plain 479 func (w *WriterProperties) DictionaryPageEncoding() Encoding { 480 if w.parquetVersion == V1_0 { 481 return Encodings.PlainDict 482 } 483 return Encodings.Plain 484 } 485 486 // DictionaryEnabled returns the default value as for whether or not dictionary encoding will be utilized for columns 487 // that aren't separately specified. 488 func (w *WriterProperties) DictionaryEnabled() bool { return w.defColumnProps.DictionaryEnabled } 489 490 // DictionaryEnabledFor returns whether or not dictionary encoding will be used for the specified column when writing 491 // or the default value if the column was not separately specified. 492 func (w *WriterProperties) DictionaryEnabledFor(path string) bool { 493 if p, ok := w.columnProps[path]; ok { 494 return p.DictionaryEnabled 495 } 496 return w.defColumnProps.DictionaryEnabled 497 } 498 499 // DictionaryEnabledPath is the same as DictionaryEnabledFor but takes a ColumnPath object. 500 func (w *WriterProperties) DictionaryEnabledPath(path ColumnPath) bool { 501 return w.DictionaryEnabledFor(path.String()) 502 } 503 504 // StatisticsEnabled returns the default value for whether or not stats are enabled to be written for columns 505 // that aren't separately specified. 506 func (w *WriterProperties) StatisticsEnabled() bool { return w.defColumnProps.StatsEnabled } 507 508 // StatisticsEnabledFor returns whether stats will be written for the given column path, or the default value if 509 // it wasn't separately specified. 510 func (w *WriterProperties) StatisticsEnabledFor(path string) bool { 511 if p, ok := w.columnProps[path]; ok { 512 return p.StatsEnabled 513 } 514 return w.defColumnProps.StatsEnabled 515 } 516 517 // StatisticsEnabledPath is the same as StatisticsEnabledFor but takes a ColumnPath object. 518 func (w *WriterProperties) StatisticsEnabledPath(path ColumnPath) bool { 519 return w.StatisticsEnabledFor(path.String()) 520 } 521 522 // MaxStatsSize returns the default maximum size for stats 523 func (w *WriterProperties) MaxStatsSize() int64 { return w.defColumnProps.MaxStatsSize } 524 525 // MaxStatsSizeFor returns the maximum stat size for the given column path 526 func (w *WriterProperties) MaxStatsSizeFor(path string) int64 { 527 if p, ok := w.columnProps[path]; ok { 528 return p.MaxStatsSize 529 } 530 return w.defColumnProps.MaxStatsSize 531 } 532 533 // MaxStatsSizePath is the same as MaxStatsSizeFor but takes a ColumnPath 534 func (w *WriterProperties) MaxStatsSizePath(path ColumnPath) int64 { 535 return w.MaxStatsSizeFor(path.String()) 536 } 537 538 // ColumnEncryptionProperties returns the specific properties for encryption that will be used for the given column path 539 func (w *WriterProperties) ColumnEncryptionProperties(path string) *ColumnEncryptionProperties { 540 if w.encryptionProps != nil { 541 return w.encryptionProps.ColumnEncryptionProperties(path) 542 } 543 return nil 544 } 545 546 // StoreDecimalAsInteger returns the config option controlling whether or not 547 // to try storing decimal data as an integer type if the precision is low enough 548 // (1 <= prec <= 18 can be stored as an int), otherwise it will be stored as 549 // a fixed len byte array. 550 func (w *WriterProperties) StoreDecimalAsInteger() bool { 551 return w.storeDecimalAsInt 552 }