github.com/apache/arrow/go/v10@v10.0.1/parquet/writer_properties.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package parquet 18 19 import ( 20 "github.com/apache/arrow/go/v10/arrow/memory" 21 "github.com/apache/arrow/go/v10/parquet/compress" 22 ) 23 24 // Constants for default property values used for the default reader, writer and column props. 25 const ( 26 // Default Buffer size used for the Reader 27 DefaultBufSize int64 = 4096 * 4 28 // Default data page size limit is 1K it's not guaranteed, but we will try to 29 // cut data pages off at this size where possible. 30 DefaultDataPageSize int64 = 1024 * 1024 31 // Default is for dictionary encoding to be turned on, use WithDictionaryDefault 32 // writer property to change that. 33 DefaultDictionaryEnabled = true 34 // If the dictionary reaches the size of this limitation, the writer will use 35 // the fallback encoding (usually plain) instead of continuing to build the 36 // dictionary index. 37 DefaultDictionaryPageSizeLimit = DefaultDataPageSize 38 // In order to attempt to facilitate data page size limits for writing, 39 // data is written in batches. Increasing the batch size may improve performance 40 // but the larger the batch size, the easier it is to overshoot the datapage limit. 41 DefaultWriteBatchSize int64 = 1024 42 // Default maximum number of rows for a single row group 43 DefaultMaxRowGroupLen int64 = 64 * 1024 * 1024 44 // Default is to have stats enabled for all columns, use writer properties to 45 // change the default, or to enable/disable for specific columns. 46 DefaultStatsEnabled = true 47 // If the stats are larger than 4K the writer will skip writing them out anyways. 48 DefaultMaxStatsSize int64 = 4096 49 DefaultCreatedBy = "parquet-go version 10.0.1" 50 DefaultRootName = "schema" 51 ) 52 53 // ColumnProperties defines the encoding, codec, and so on for a given column. 54 type ColumnProperties struct { 55 Encoding Encoding 56 Codec compress.Compression 57 DictionaryEnabled bool 58 StatsEnabled bool 59 MaxStatsSize int64 60 CompressionLevel int 61 } 62 63 // DefaultColumnProperties returns the default properties which get utilized for writing. 64 // 65 // The default column properties are the following constants: 66 // Encoding: Encodings.Plain 67 // Codec: compress.Codecs.Uncompressed 68 // DictionaryEnabled: DefaultDictionaryEnabled 69 // StatsEnabled: DefaultStatsEnabled 70 // MaxStatsSize: DefaultMaxStatsSize 71 // CompressionLevel: compress.DefaultCompressionLevel 72 func DefaultColumnProperties() ColumnProperties { 73 return ColumnProperties{ 74 Encoding: Encodings.Plain, 75 Codec: compress.Codecs.Uncompressed, 76 DictionaryEnabled: DefaultDictionaryEnabled, 77 StatsEnabled: DefaultStatsEnabled, 78 MaxStatsSize: DefaultMaxStatsSize, 79 CompressionLevel: compress.DefaultCompressionLevel, 80 } 81 } 82 83 type writerPropConfig struct { 84 wr *WriterProperties 85 encodings map[string]Encoding 86 codecs map[string]compress.Compression 87 compressLevel map[string]int 88 dictEnabled map[string]bool 89 statsEnabled map[string]bool 90 } 91 92 // WriterProperty is used as the options for building a writer properties instance 93 type WriterProperty func(*writerPropConfig) 94 95 // WithAllocator specifies the writer to use the given allocator 96 func WithAllocator(mem memory.Allocator) WriterProperty { 97 return func(cfg *writerPropConfig) { 98 cfg.wr.mem = mem 99 } 100 } 101 102 // WithDictionaryDefault sets the default value for whether to enable dictionary encoding 103 func WithDictionaryDefault(dict bool) WriterProperty { 104 return func(cfg *writerPropConfig) { 105 cfg.wr.defColumnProps.DictionaryEnabled = dict 106 } 107 } 108 109 // WithDictionaryFor allows enabling or disabling dictionary encoding for a given column path string 110 func WithDictionaryFor(path string, dict bool) WriterProperty { 111 return func(cfg *writerPropConfig) { 112 cfg.dictEnabled[path] = dict 113 } 114 } 115 116 // WithDictionaryPath is like WithDictionaryFor, but takes a ColumnPath type 117 func WithDictionaryPath(path ColumnPath, dict bool) WriterProperty { 118 return WithDictionaryFor(path.String(), dict) 119 } 120 121 // WithDictionaryPageSizeLimit is the limit of the dictionary at which the writer 122 // will fallback to plain encoding instead 123 func WithDictionaryPageSizeLimit(limit int64) WriterProperty { 124 return func(cfg *writerPropConfig) { 125 cfg.wr.dictPagesize = limit 126 } 127 } 128 129 // WithBatchSize specifies the number of rows to use for batch writes to columns 130 func WithBatchSize(batch int64) WriterProperty { 131 return func(cfg *writerPropConfig) { 132 cfg.wr.batchSize = batch 133 } 134 } 135 136 // WithMaxRowGroupLength specifies the number of rows as the maximum number of rows for a given row group in the writer. 137 func WithMaxRowGroupLength(nrows int64) WriterProperty { 138 return func(cfg *writerPropConfig) { 139 cfg.wr.maxRowGroupLen = nrows 140 } 141 } 142 143 // WithDataPageSize specifies the size to use for splitting data pages for column writing. 144 func WithDataPageSize(pgsize int64) WriterProperty { 145 return func(cfg *writerPropConfig) { 146 cfg.wr.pageSize = pgsize 147 } 148 } 149 150 // WithDataPageVersion specifies whether to use Version 1 or Version 2 of the DataPage spec 151 func WithDataPageVersion(version DataPageVersion) WriterProperty { 152 return func(cfg *writerPropConfig) { 153 cfg.wr.dataPageVersion = version 154 } 155 } 156 157 // WithVersion specifies which Parquet Spec version to utilize for writing. 158 func WithVersion(version Version) WriterProperty { 159 return func(cfg *writerPropConfig) { 160 cfg.wr.parquetVersion = version 161 } 162 } 163 164 // WithCreatedBy specifies the "created by" string to use for the writer 165 func WithCreatedBy(createdby string) WriterProperty { 166 return func(cfg *writerPropConfig) { 167 cfg.wr.createdBy = createdby 168 } 169 } 170 171 // WithRootName enables customization of the name used for the root schema node. This is required 172 // to maintain compatibility with other tools. 173 func WithRootName(name string) WriterProperty { 174 return func(cfg *writerPropConfig) { 175 cfg.wr.rootName = name 176 } 177 } 178 179 // WithRootRepetition enables customization of the repetition used for the root schema node. 180 // This is required to maintain compatibility with other tools. 181 func WithRootRepetition(repetition Repetition) WriterProperty { 182 return func(cfg *writerPropConfig) { 183 cfg.wr.rootRepetition = repetition 184 } 185 } 186 187 // WithEncoding defines the encoding that is used when we aren't using dictionary encoding. 188 // 189 // This is either applied if dictionary encoding is disabled, or if we fallback if the dictionary 190 // grew too large. 191 func WithEncoding(encoding Encoding) WriterProperty { 192 return func(cfg *writerPropConfig) { 193 if encoding == Encodings.PlainDict || encoding == Encodings.RLEDict { 194 panic("parquet: can't use dictionary encoding as fallback encoding") 195 } 196 cfg.wr.defColumnProps.Encoding = encoding 197 } 198 } 199 200 // WithEncodingFor is for defining the encoding only for a specific column path. This encoding will be used 201 // if dictionary encoding is disabled for the column or if we fallback because the dictionary grew too large 202 func WithEncodingFor(path string, encoding Encoding) WriterProperty { 203 return func(cfg *writerPropConfig) { 204 if encoding == Encodings.PlainDict || encoding == Encodings.RLEDict { 205 panic("parquet: can't use dictionary encoding as fallback encoding") 206 } 207 cfg.encodings[path] = encoding 208 } 209 } 210 211 // WithEncodingPath is the same as WithEncodingFor but takes a ColumnPath directly. 212 func WithEncodingPath(path ColumnPath, encoding Encoding) WriterProperty { 213 return WithEncodingFor(path.String(), encoding) 214 } 215 216 // WithCompression specifies the default compression type to use for column writing. 217 func WithCompression(codec compress.Compression) WriterProperty { 218 return func(cfg *writerPropConfig) { 219 cfg.wr.defColumnProps.Codec = codec 220 } 221 } 222 223 // WithCompressionFor specifies the compression type for the given column. 224 func WithCompressionFor(path string, codec compress.Compression) WriterProperty { 225 return func(cfg *writerPropConfig) { 226 cfg.codecs[path] = codec 227 } 228 } 229 230 // WithCompressionPath is the same as WithCompressionFor but takes a ColumnPath directly. 231 func WithCompressionPath(path ColumnPath, codec compress.Compression) WriterProperty { 232 return WithCompressionFor(path.String(), codec) 233 } 234 235 // WithMaxStatsSize sets a maximum size for the statistics before we decide not to include them. 236 func WithMaxStatsSize(maxStatsSize int64) WriterProperty { 237 return func(cfg *writerPropConfig) { 238 cfg.wr.defColumnProps.MaxStatsSize = maxStatsSize 239 } 240 } 241 242 // WithCompressionLevel specifies the default compression level for the compressor in every column. 243 // 244 // The provided compression level is compressor specific. The user would have to know what the available 245 // levels are for the selected compressor. If the compressor does not allow for selecting different 246 // compression levels, then this function will have no effect. Parquet and Arrow will not validate the 247 // passed compression level. If no level is selected by the user or if the special compress.DefaultCompressionLevel 248 // value is used, then parquet will select the compression level. 249 func WithCompressionLevel(level int) WriterProperty { 250 return func(cfg *writerPropConfig) { 251 cfg.wr.defColumnProps.CompressionLevel = level 252 } 253 } 254 255 // WithCompressionLevelFor is like WithCompressionLevel but only for the given column path. 256 func WithCompressionLevelFor(path string, level int) WriterProperty { 257 return func(cfg *writerPropConfig) { 258 cfg.compressLevel[path] = level 259 } 260 } 261 262 // WithCompressionLevelPath is the same as WithCompressionLevelFor but takes a ColumnPath 263 func WithCompressionLevelPath(path ColumnPath, level int) WriterProperty { 264 return WithCompressionLevelFor(path.String(), level) 265 } 266 267 // WithStats specifies a default for whether or not to enable column statistics. 268 func WithStats(enabled bool) WriterProperty { 269 return func(cfg *writerPropConfig) { 270 cfg.wr.defColumnProps.StatsEnabled = enabled 271 } 272 } 273 274 // WithStatsFor specifies a per column value as to enable or disable statistics in the resulting file. 275 func WithStatsFor(path string, enabled bool) WriterProperty { 276 return func(cfg *writerPropConfig) { 277 cfg.statsEnabled[path] = enabled 278 } 279 } 280 281 // WithStatsPath is the same as WithStatsFor but takes a ColumnPath 282 func WithStatsPath(path ColumnPath, enabled bool) WriterProperty { 283 return WithStatsFor(path.String(), enabled) 284 } 285 286 // WithEncryptionProperties specifies the file level encryption handling for writing the file. 287 func WithEncryptionProperties(props *FileEncryptionProperties) WriterProperty { 288 return func(cfg *writerPropConfig) { 289 cfg.wr.encryptionProps = props 290 } 291 } 292 293 // WriterProperties is the collection of properties to use for writing a parquet file. The values are 294 // read only once it has been constructed. 295 type WriterProperties struct { 296 mem memory.Allocator 297 dictPagesize int64 298 batchSize int64 299 maxRowGroupLen int64 300 pageSize int64 301 parquetVersion Version 302 createdBy string 303 dataPageVersion DataPageVersion 304 rootName string 305 rootRepetition Repetition 306 307 defColumnProps ColumnProperties 308 columnProps map[string]*ColumnProperties 309 encryptionProps *FileEncryptionProperties 310 } 311 312 func defaultWriterProperties() *WriterProperties { 313 return &WriterProperties{ 314 mem: memory.DefaultAllocator, 315 dictPagesize: DefaultDictionaryPageSizeLimit, 316 batchSize: DefaultWriteBatchSize, 317 maxRowGroupLen: DefaultMaxRowGroupLen, 318 pageSize: DefaultDataPageSize, 319 parquetVersion: V2_LATEST, 320 dataPageVersion: DataPageV1, 321 createdBy: DefaultCreatedBy, 322 rootName: DefaultRootName, 323 rootRepetition: Repetitions.Repeated, 324 defColumnProps: DefaultColumnProperties(), 325 } 326 } 327 328 // NewWriterProperties takes a list of options for building the properties. If multiple options are used which conflict 329 // then the last option is the one which will take effect. If no WriterProperty options are provided, then the default 330 // properties will be utilized for writing. 331 // 332 // The Default properties use the following constants: 333 // Allocator: memory.DefaultAllocator 334 // DictionaryPageSize: DefaultDictionaryPageSizeLimit 335 // BatchSize: DefaultWriteBatchSize 336 // MaxRowGroupLength: DefaultMaxRowGroupLen 337 // PageSize: DefaultDataPageSize 338 // ParquetVersion: V1 339 // DataPageVersion: DataPageV1 340 // CreatedBy: DefaultCreatedBy 341 func NewWriterProperties(opts ...WriterProperty) *WriterProperties { 342 cfg := writerPropConfig{ 343 wr: defaultWriterProperties(), 344 encodings: make(map[string]Encoding), 345 codecs: make(map[string]compress.Compression), 346 compressLevel: make(map[string]int), 347 dictEnabled: make(map[string]bool), 348 statsEnabled: make(map[string]bool), 349 } 350 for _, o := range opts { 351 o(&cfg) 352 } 353 354 cfg.wr.columnProps = make(map[string]*ColumnProperties) 355 get := func(key string) *ColumnProperties { 356 if p, ok := cfg.wr.columnProps[key]; ok { 357 return p 358 } 359 cfg.wr.columnProps[key] = new(ColumnProperties) 360 *cfg.wr.columnProps[key] = cfg.wr.defColumnProps 361 return cfg.wr.columnProps[key] 362 } 363 364 for key, value := range cfg.encodings { 365 get(key).Encoding = value 366 } 367 368 for key, value := range cfg.codecs { 369 get(key).Codec = value 370 } 371 372 for key, value := range cfg.compressLevel { 373 get(key).CompressionLevel = value 374 } 375 376 for key, value := range cfg.dictEnabled { 377 get(key).DictionaryEnabled = value 378 } 379 380 for key, value := range cfg.statsEnabled { 381 get(key).StatsEnabled = value 382 } 383 return cfg.wr 384 } 385 386 // FileEncryptionProperties returns the current encryption properties that were 387 // used to create the writer properties. 388 func (w *WriterProperties) FileEncryptionProperties() *FileEncryptionProperties { 389 return w.encryptionProps 390 } 391 392 func (w *WriterProperties) Allocator() memory.Allocator { return w.mem } 393 func (w *WriterProperties) CreatedBy() string { return w.createdBy } 394 func (w *WriterProperties) RootName() string { return w.rootName } 395 func (w *WriterProperties) RootRepetition() Repetition { return w.rootRepetition } 396 func (w *WriterProperties) WriteBatchSize() int64 { return w.batchSize } 397 func (w *WriterProperties) DataPageSize() int64 { return w.pageSize } 398 func (w *WriterProperties) DictionaryPageSizeLimit() int64 { return w.dictPagesize } 399 func (w *WriterProperties) Version() Version { return w.parquetVersion } 400 func (w *WriterProperties) DataPageVersion() DataPageVersion { return w.dataPageVersion } 401 func (w *WriterProperties) MaxRowGroupLength() int64 { return w.maxRowGroupLen } 402 403 // Compression returns the default compression type that will be used for any columns that don't 404 // have a specific compression defined. 405 func (w *WriterProperties) Compression() compress.Compression { return w.defColumnProps.Codec } 406 407 // CompressionFor will return the compression type that is specified for the given column path, or 408 // the default compression codec if there isn't one specific to this column. 409 func (w *WriterProperties) CompressionFor(path string) compress.Compression { 410 if p, ok := w.columnProps[path]; ok { 411 return p.Codec 412 } 413 return w.defColumnProps.Codec 414 } 415 416 //CompressionPath is the same as CompressionFor but takes a ColumnPath 417 func (w *WriterProperties) CompressionPath(path ColumnPath) compress.Compression { 418 return w.CompressionFor(path.String()) 419 } 420 421 // CompressionLevel returns the default compression level that will be used for any column 422 // that doesn't have a compression level specified for it. 423 func (w *WriterProperties) CompressionLevel() int { return w.defColumnProps.CompressionLevel } 424 425 // CompressionLevelFor returns the compression level that will be utilized for the given column, 426 // or the default compression level if the column doesn't have a specific level specified. 427 func (w *WriterProperties) CompressionLevelFor(path string) int { 428 if p, ok := w.columnProps[path]; ok { 429 return p.CompressionLevel 430 } 431 return w.defColumnProps.CompressionLevel 432 } 433 434 // CompressionLevelPath is the same as CompressionLevelFor but takes a ColumnPath object 435 func (w *WriterProperties) CompressionLevelPath(path ColumnPath) int { 436 return w.CompressionLevelFor(path.String()) 437 } 438 439 // Encoding returns the default encoding that will be utilized for any columns which don't have a different value 440 // specified. 441 func (w *WriterProperties) Encoding() Encoding { return w.defColumnProps.Encoding } 442 443 // EncodingFor returns the encoding that will be used for the given column path, or the default encoding if there 444 // isn't one specified for this column. 445 func (w *WriterProperties) EncodingFor(path string) Encoding { 446 if p, ok := w.columnProps[path]; ok { 447 return p.Encoding 448 } 449 return w.defColumnProps.Encoding 450 } 451 452 // EncodingPath is the same as EncodingFor but takes a ColumnPath object 453 func (w *WriterProperties) EncodingPath(path ColumnPath) Encoding { 454 return w.EncodingFor(path.String()) 455 } 456 457 // DictionaryIndexEncoding returns which encoding will be used for the Dictionary Index values based on the 458 // parquet version. V1 uses PlainDict and V2 uses RLEDict 459 func (w *WriterProperties) DictionaryIndexEncoding() Encoding { 460 if w.parquetVersion == V1_0 { 461 return Encodings.PlainDict 462 } 463 return Encodings.RLEDict 464 } 465 466 // DictionaryPageEncoding returns the encoding that will be utilized for the DictionaryPage itself based on the parquet 467 // version. V1 uses PlainDict, v2 uses Plain 468 func (w *WriterProperties) DictionaryPageEncoding() Encoding { 469 if w.parquetVersion == V1_0 { 470 return Encodings.PlainDict 471 } 472 return Encodings.Plain 473 } 474 475 // DictionaryEnabled returns the default value as for whether or not dictionary encoding will be utilized for columns 476 // that aren't separately specified. 477 func (w *WriterProperties) DictionaryEnabled() bool { return w.defColumnProps.DictionaryEnabled } 478 479 // DictionaryEnabledFor returns whether or not dictionary encoding will be used for the specified column when writing 480 // or the default value if the column was not separately specified. 481 func (w *WriterProperties) DictionaryEnabledFor(path string) bool { 482 if p, ok := w.columnProps[path]; ok { 483 return p.DictionaryEnabled 484 } 485 return w.defColumnProps.DictionaryEnabled 486 } 487 488 // DictionaryEnabledPath is the same as DictionaryEnabledFor but takes a ColumnPath object. 489 func (w *WriterProperties) DictionaryEnabledPath(path ColumnPath) bool { 490 return w.DictionaryEnabledFor(path.String()) 491 } 492 493 // StatisticsEnabled returns the default value for whether or not stats are enabled to be written for columns 494 // that aren't separately specified. 495 func (w *WriterProperties) StatisticsEnabled() bool { return w.defColumnProps.StatsEnabled } 496 497 // StatisticsEnabledFor returns whether stats will be written for the given column path, or the default value if 498 // it wasn't separately specified. 499 func (w *WriterProperties) StatisticsEnabledFor(path string) bool { 500 if p, ok := w.columnProps[path]; ok { 501 return p.StatsEnabled 502 } 503 return w.defColumnProps.StatsEnabled 504 } 505 506 // StatisticsEnabledPath is the same as StatisticsEnabledFor but takes a ColumnPath object. 507 func (w *WriterProperties) StatisticsEnabledPath(path ColumnPath) bool { 508 return w.StatisticsEnabledFor(path.String()) 509 } 510 511 // MaxStatsSize returns the default maximum size for stats 512 func (w *WriterProperties) MaxStatsSize() int64 { return w.defColumnProps.MaxStatsSize } 513 514 // MaxStatsSizeFor returns the maximum stat size for the given column path 515 func (w *WriterProperties) MaxStatsSizeFor(path string) int64 { 516 if p, ok := w.columnProps[path]; ok { 517 return p.MaxStatsSize 518 } 519 return w.defColumnProps.MaxStatsSize 520 } 521 522 // MaxStatsSizePath is the same as MaxStatsSizeFor but takes a ColumnPath 523 func (w *WriterProperties) MaxStatsSizePath(path ColumnPath) int64 { 524 return w.MaxStatsSizeFor(path.String()) 525 } 526 527 // ColumnEncryptionProperties returns the specific properties for encryption that will be used for the given column path 528 func (w *WriterProperties) ColumnEncryptionProperties(path string) *ColumnEncryptionProperties { 529 if w.encryptionProps != nil { 530 return w.encryptionProps.ColumnEncryptionProperties(path) 531 } 532 return nil 533 }