github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/config.go (about) 1 package parquet 2 3 import ( 4 "fmt" 5 "strings" 6 7 "github.com/vc42/parquet-go/compress" 8 ) 9 10 const ( 11 DefaultCreatedBy = "github.com/vc42/parquet-go" 12 DefaultColumnIndexSizeLimit = 16 13 DefaultColumnBufferCapacity = 16 * 1024 14 DefaultPageBufferSize = 256 * 1024 15 DefaultWriteBufferSize = 32 * 1024 16 DefaultDataPageVersion = 2 17 DefaultDataPageStatistics = false 18 DefaultSkipPageIndex = false 19 DefaultSkipBloomFilters = false 20 ) 21 22 // The FileConfig type carries configuration options for parquet files. 23 // 24 // FileConfig implements the FileOption interface so it can be used directly 25 // as argument to the OpenFile function when needed, for example: 26 // 27 // f, err := parquet.OpenFile(reader, size, &parquet.FileConfig{ 28 // SkipPageIndex: true, 29 // SkipBloomFilters: true, 30 // }) 31 // 32 type FileConfig struct { 33 SkipPageIndex bool 34 SkipBloomFilters bool 35 } 36 37 // DefaultFileConfig returns a new FileConfig value initialized with the 38 // default file configuration. 39 func DefaultFileConfig() *FileConfig { 40 return &FileConfig{ 41 SkipPageIndex: DefaultSkipPageIndex, 42 SkipBloomFilters: DefaultSkipBloomFilters, 43 } 44 } 45 46 // NewFileConfig constructs a new file configuration applying the options passed 47 // as arguments. 48 // 49 // The function returns an non-nil error if some of the options carried invalid 50 // configuration values. 51 func NewFileConfig(options ...FileOption) (*FileConfig, error) { 52 config := DefaultFileConfig() 53 config.Apply(options...) 54 return config, config.Validate() 55 } 56 57 // Apply applies the given list of options to c. 58 func (c *FileConfig) Apply(options ...FileOption) { 59 for _, opt := range options { 60 opt.ConfigureFile(c) 61 } 62 } 63 64 // ConfigureFile applies configuration options from c to config. 65 func (c *FileConfig) ConfigureFile(config *FileConfig) { 66 *config = FileConfig{ 67 SkipPageIndex: config.SkipPageIndex, 68 SkipBloomFilters: config.SkipBloomFilters, 69 } 70 } 71 72 // Validate returns a non-nil error if the configuration of c is invalid. 73 func (c *FileConfig) Validate() error { 74 return nil 75 } 76 77 // The ReaderConfig type carries configuration options for parquet readers. 78 // 79 // ReaderConfig implements the ReaderOption interface so it can be used directly 80 // as argument to the NewReader function when needed, for example: 81 // 82 // reader := parquet.NewReader(output, schema, &parquet.ReaderConfig{ 83 // // ... 84 // }) 85 // 86 type ReaderConfig struct { 87 Schema *Schema 88 } 89 90 // DefaultReaderConfig returns a new ReaderConfig value initialized with the 91 // default reader configuration. 92 func DefaultReaderConfig() *ReaderConfig { 93 return &ReaderConfig{} 94 } 95 96 // NewReaderConfig constructs a new reader configuration applying the options 97 // passed as arguments. 98 // 99 // The function returns an non-nil error if some of the options carried invalid 100 // configuration values. 101 func NewReaderConfig(options ...ReaderOption) (*ReaderConfig, error) { 102 config := DefaultReaderConfig() 103 config.Apply(options...) 104 return config, config.Validate() 105 } 106 107 // Apply applies the given list of options to c. 108 func (c *ReaderConfig) Apply(options ...ReaderOption) { 109 for _, opt := range options { 110 opt.ConfigureReader(c) 111 } 112 } 113 114 // ConfigureReader applies configuration options from c to config. 115 func (c *ReaderConfig) ConfigureReader(config *ReaderConfig) { 116 *config = ReaderConfig{ 117 Schema: coalesceSchema(c.Schema, config.Schema), 118 } 119 } 120 121 // Validate returns a non-nil error if the configuration of c is invalid. 122 func (c *ReaderConfig) Validate() error { 123 return nil 124 } 125 126 // The WriterConfig type carries configuration options for parquet writers. 127 // 128 // WriterConfig implements the WriterOption interface so it can be used directly 129 // as argument to the NewWriter function when needed, for example: 130 // 131 // writer := parquet.NewWriter(output, schema, &parquet.WriterConfig{ 132 // CreatedBy: "my test program", 133 // }) 134 // 135 type WriterConfig struct { 136 CreatedBy string 137 ColumnPageBuffers PageBufferPool 138 ColumnIndexSizeLimit int 139 PageBufferSize int 140 WriteBufferSize int 141 DataPageVersion int 142 DataPageStatistics bool 143 KeyValueMetadata map[string]string 144 Schema *Schema 145 SortingColumns []SortingColumn 146 BloomFilters []BloomFilterColumn 147 Compression compress.Codec 148 } 149 150 // DefaultWriterConfig returns a new WriterConfig value initialized with the 151 // default writer configuration. 152 func DefaultWriterConfig() *WriterConfig { 153 return &WriterConfig{ 154 CreatedBy: DefaultCreatedBy, 155 ColumnPageBuffers: &defaultPageBufferPool, 156 ColumnIndexSizeLimit: DefaultColumnIndexSizeLimit, 157 PageBufferSize: DefaultPageBufferSize, 158 WriteBufferSize: DefaultWriteBufferSize, 159 DataPageVersion: DefaultDataPageVersion, 160 DataPageStatistics: DefaultDataPageStatistics, 161 } 162 } 163 164 // NewWriterConfig constructs a new writer configuration applying the options 165 // passed as arguments. 166 // 167 // The function returns an non-nil error if some of the options carried invalid 168 // configuration values. 169 func NewWriterConfig(options ...WriterOption) (*WriterConfig, error) { 170 config := DefaultWriterConfig() 171 config.Apply(options...) 172 return config, config.Validate() 173 } 174 175 // Apply applies the given list of options to c. 176 func (c *WriterConfig) Apply(options ...WriterOption) { 177 for _, opt := range options { 178 opt.ConfigureWriter(c) 179 } 180 } 181 182 // ConfigureWriter applies configuration options from c to config. 183 func (c *WriterConfig) ConfigureWriter(config *WriterConfig) { 184 keyValueMetadata := config.KeyValueMetadata 185 if len(c.KeyValueMetadata) > 0 { 186 if keyValueMetadata == nil { 187 keyValueMetadata = make(map[string]string, len(c.KeyValueMetadata)) 188 } 189 for k, v := range c.KeyValueMetadata { 190 keyValueMetadata[k] = v 191 } 192 } 193 *config = WriterConfig{ 194 CreatedBy: coalesceString(c.CreatedBy, config.CreatedBy), 195 ColumnPageBuffers: coalescePageBufferPool(c.ColumnPageBuffers, config.ColumnPageBuffers), 196 ColumnIndexSizeLimit: coalesceInt(c.ColumnIndexSizeLimit, config.ColumnIndexSizeLimit), 197 PageBufferSize: coalesceInt(c.PageBufferSize, config.PageBufferSize), 198 WriteBufferSize: coalesceInt(c.WriteBufferSize, config.WriteBufferSize), 199 DataPageVersion: coalesceInt(c.DataPageVersion, config.DataPageVersion), 200 DataPageStatistics: config.DataPageStatistics, 201 KeyValueMetadata: keyValueMetadata, 202 Schema: coalesceSchema(c.Schema, config.Schema), 203 SortingColumns: coalesceSortingColumns(c.SortingColumns, config.SortingColumns), 204 BloomFilters: coalesceBloomFilters(c.BloomFilters, config.BloomFilters), 205 Compression: coalesceCompression(c.Compression, config.Compression), 206 } 207 } 208 209 // Validate returns a non-nil error if the configuration of c is invalid. 210 func (c *WriterConfig) Validate() error { 211 const baseName = "parquet.(*WriterConfig)." 212 return errorInvalidConfiguration( 213 validateNotNil(baseName+"ColumnPageBuffers", c.ColumnPageBuffers), 214 validatePositiveInt(baseName+"ColumnIndexSizeLimit", c.ColumnIndexSizeLimit), 215 validatePositiveInt(baseName+"PageBufferSize", c.PageBufferSize), 216 validateOneOfInt(baseName+"DataPageVersion", c.DataPageVersion, 1, 2), 217 ) 218 } 219 220 // The RowGroupConfig type carries configuration options for parquet row groups. 221 // 222 // RowGroupConfig implements the RowGroupOption interface so it can be used 223 // directly as argument to the NewBuffer function when needed, for example: 224 // 225 // buffer := parquet.NewBuffer(&parquet.RowGroupConfig{ 226 // ColumnBufferCapacity: 10_000, 227 // }) 228 // 229 type RowGroupConfig struct { 230 ColumnBufferCapacity int 231 SortingColumns []SortingColumn 232 Schema *Schema 233 } 234 235 // DefaultRowGroupConfig returns a new RowGroupConfig value initialized with the 236 // default row group configuration. 237 func DefaultRowGroupConfig() *RowGroupConfig { 238 return &RowGroupConfig{ 239 ColumnBufferCapacity: DefaultColumnBufferCapacity, 240 } 241 } 242 243 // NewRowGroupConfig constructs a new row group configuration applying the 244 // options passed as arguments. 245 // 246 // The function returns an non-nil error if some of the options carried invalid 247 // configuration values. 248 func NewRowGroupConfig(options ...RowGroupOption) (*RowGroupConfig, error) { 249 config := DefaultRowGroupConfig() 250 config.Apply(options...) 251 return config, config.Validate() 252 } 253 254 // Validate returns a non-nil error if the configuration of c is invalid. 255 func (c *RowGroupConfig) Validate() error { 256 const baseName = "parquet.(*RowGroupConfig)." 257 return errorInvalidConfiguration( 258 validatePositiveInt(baseName+"ColumnBufferCapacity", c.ColumnBufferCapacity), 259 ) 260 } 261 262 func (c *RowGroupConfig) Apply(options ...RowGroupOption) { 263 for _, opt := range options { 264 opt.ConfigureRowGroup(c) 265 } 266 } 267 268 func (c *RowGroupConfig) ConfigureRowGroup(config *RowGroupConfig) { 269 *config = RowGroupConfig{ 270 ColumnBufferCapacity: coalesceInt(c.ColumnBufferCapacity, config.ColumnBufferCapacity), 271 SortingColumns: coalesceSortingColumns(c.SortingColumns, config.SortingColumns), 272 Schema: coalesceSchema(c.Schema, config.Schema), 273 } 274 } 275 276 // FileOption is an interface implemented by types that carry configuration 277 // options for parquet files. 278 type FileOption interface { 279 ConfigureFile(*FileConfig) 280 } 281 282 // ReaderOption is an interface implemented by types that carry configuration 283 // options for parquet readers. 284 type ReaderOption interface { 285 ConfigureReader(*ReaderConfig) 286 } 287 288 // WriterOption is an interface implemented by types that carry configuration 289 // options for parquet writers. 290 type WriterOption interface { 291 ConfigureWriter(*WriterConfig) 292 } 293 294 // RowGroupOption is an interface implemented by types that carry configuration 295 // options for parquet row groups. 296 type RowGroupOption interface { 297 ConfigureRowGroup(*RowGroupConfig) 298 } 299 300 // SkipPageIndex is a file configuration option which prevents automatically 301 // reading the page index when opening a parquet file, when set to true. This is 302 // useful as an optimization when programs know that they will not need to 303 // consume the page index. 304 // 305 // Defaults to false. 306 func SkipPageIndex(skip bool) FileOption { 307 return fileOption(func(config *FileConfig) { config.SkipPageIndex = skip }) 308 } 309 310 // SkipBloomFilters is a file configuration option which prevents automatically 311 // reading the bloom filters when opening a parquet file, when set to true. 312 // This is useful as an optimization when programs know that they will not need 313 // to consume the bloom filters. 314 // 315 // Defaults to false. 316 func SkipBloomFilters(skip bool) FileOption { 317 return fileOption(func(config *FileConfig) { config.SkipBloomFilters = skip }) 318 } 319 320 // PageBufferSize configures the size of column page buffers on parquet writers. 321 // 322 // Note that the page buffer size refers to the in-memory buffers where pages 323 // are generated, not the size of pages after encoding and compression. 324 // This design choice was made to help control the amount of memory needed to 325 // read and write pages rather than controlling the space used by the encoded 326 // representation on disk. 327 // 328 // Defaults to 256KiB. 329 func PageBufferSize(size int) WriterOption { 330 return writerOption(func(config *WriterConfig) { config.PageBufferSize = size }) 331 } 332 333 // WriteBufferSize configures the size of the write buffer. 334 // 335 // Setting the writer buffer size to zero deactivates buffering, all writes are 336 // immediately sent to the output io.Writer. 337 // 338 // Defaults to 32KiB. 339 func WriteBufferSize(size int) WriterOption { 340 return writerOption(func(config *WriterConfig) { config.WriteBufferSize = size }) 341 } 342 343 // CreatedBy creates a configuration option which sets the name of the 344 // application that created a parquet file. 345 // 346 // By default, this information is omitted. 347 func CreatedBy(createdBy string) WriterOption { 348 return writerOption(func(config *WriterConfig) { config.CreatedBy = createdBy }) 349 } 350 351 // ColumnPageBuffers creates a configuration option to customize the buffer pool 352 // used when constructing row groups. This can be used to provide on-disk buffers 353 // as swap space to ensure that the parquet file creation will no be bottlenecked 354 // on the amount of memory available. 355 // 356 // Defaults to using in-memory buffers. 357 func ColumnPageBuffers(buffers PageBufferPool) WriterOption { 358 return writerOption(func(config *WriterConfig) { config.ColumnPageBuffers = buffers }) 359 } 360 361 // ColumnIndexSizeLimit creates a configuration option to customize the size 362 // limit of page boundaries recorded in column indexes. 363 // 364 // Defaults to 16. 365 func ColumnIndexSizeLimit(sizeLimit int) WriterOption { 366 return writerOption(func(config *WriterConfig) { config.ColumnIndexSizeLimit = sizeLimit }) 367 } 368 369 // DataPageVersion creates a configuration option which configures the version of 370 // data pages used when creating a parquet file. 371 // 372 // Defaults to version 2. 373 func DataPageVersion(version int) WriterOption { 374 return writerOption(func(config *WriterConfig) { config.DataPageVersion = version }) 375 } 376 377 // DataPageStatistics creates a configuration option which defines whether data 378 // page statistics are emitted. This option is useful when generating parquet 379 // files that intend to be backward compatible with older readers which may not 380 // have the ability to load page statistics from the column index. 381 // 382 // Defaults to false. 383 func DataPageStatistics(enabled bool) WriterOption { 384 return writerOption(func(config *WriterConfig) { config.DataPageStatistics = enabled }) 385 } 386 387 // KeyValueMetadata creates a configuration option which adds key/value metadata 388 // to add to the metadata of parquet files. 389 // 390 // This option is additive, it may be used multiple times to add more than one 391 // key/value pair. 392 // 393 // Keys are assumed to be unique, if the same key is repeated multiple times the 394 // last value is retained. While the parquet format does not require unique keys, 395 // this design decision was made to optimize for the most common use case where 396 // applications leverage this extension mechanism to associate single values to 397 // keys. This may create incompatibilities with other parquet libraries, or may 398 // cause some key/value pairs to be lost when open parquet files written with 399 // repeated keys. We can revisit this decision if it ever becomes a blocker. 400 func KeyValueMetadata(key, value string) WriterOption { 401 return writerOption(func(config *WriterConfig) { 402 if config.KeyValueMetadata == nil { 403 config.KeyValueMetadata = map[string]string{key: value} 404 } else { 405 config.KeyValueMetadata[key] = value 406 } 407 }) 408 } 409 410 // BloomFilters creates a configuration option which defines the bloom filters 411 // that parquet writers should generate. 412 // 413 // The compute and memory footprint of generating bloom filters for all columns 414 // of a parquet schema can be significant, so by default no filters are created 415 // and applications need to explicitly declare the columns that they want to 416 // create filters for. 417 func BloomFilters(filters ...BloomFilterColumn) WriterOption { 418 filters = append([]BloomFilterColumn{}, filters...) 419 return writerOption(func(config *WriterConfig) { config.BloomFilters = filters }) 420 } 421 422 // Compression creates a configuration option which sets the default compression 423 // codec used by a writer for columns where none were defined. 424 func Compression(codec compress.Codec) WriterOption { 425 return writerOption(func(config *WriterConfig) { config.Compression = codec }) 426 } 427 428 // ColumnBufferCapacity creates a configuration option which defines the size of 429 // row group column buffers. 430 // 431 // Defaults to 16384. 432 func ColumnBufferCapacity(size int) RowGroupOption { 433 return rowGroupOption(func(config *RowGroupConfig) { config.ColumnBufferCapacity = size }) 434 } 435 436 // SortingColumns creates a configuration option which defines the sorting order 437 // of columns in a row group. 438 // 439 // The order of sorting columns passed as argument defines the ordering 440 // hierarchy; when elements are equal in the first column, the second column is 441 // used to order rows, etc... 442 func SortingColumns(columns ...SortingColumn) interface { 443 RowGroupOption 444 WriterOption 445 } { 446 // Make a copy so that we do not retain the input slice generated implicitly 447 // for the variable argument list, and also avoid having a nil slice when 448 // the option is passed with no sorting columns, so we can differentiate it 449 // from it not being passed. 450 columns = append([]SortingColumn{}, columns...) 451 return sortingColumns(columns) 452 } 453 454 type sortingColumns []SortingColumn 455 456 func (columns sortingColumns) ConfigureRowGroup(config *RowGroupConfig) { 457 config.SortingColumns = columns 458 } 459 460 func (columns sortingColumns) ConfigureWriter(config *WriterConfig) { 461 config.SortingColumns = columns 462 } 463 464 type fileOption func(*FileConfig) 465 466 func (opt fileOption) ConfigureFile(config *FileConfig) { opt(config) } 467 468 type readerOption func(*ReaderConfig) 469 470 func (opt readerOption) ConfigureReader(config *ReaderConfig) { opt(config) } 471 472 type writerOption func(*WriterConfig) 473 474 func (opt writerOption) ConfigureWriter(config *WriterConfig) { opt(config) } 475 476 type rowGroupOption func(*RowGroupConfig) 477 478 func (opt rowGroupOption) ConfigureRowGroup(config *RowGroupConfig) { opt(config) } 479 480 func coalesceInt(i1, i2 int) int { 481 if i1 != 0 { 482 return i1 483 } 484 return i2 485 } 486 487 func coalesceInt64(i1, i2 int64) int64 { 488 if i1 != 0 { 489 return i1 490 } 491 return i2 492 } 493 494 func coalesceString(s1, s2 string) string { 495 if s1 != "" { 496 return s1 497 } 498 return s2 499 } 500 501 func coalesceBytes(b1, b2 []byte) []byte { 502 if b1 != nil { 503 return b1 504 } 505 return b2 506 } 507 508 func coalescePageBufferPool(p1, p2 PageBufferPool) PageBufferPool { 509 if p1 != nil { 510 return p1 511 } 512 return p2 513 } 514 515 func coalesceSchema(s1, s2 *Schema) *Schema { 516 if s1 != nil { 517 return s1 518 } 519 return s2 520 } 521 522 func coalesceSortingColumns(s1, s2 []SortingColumn) []SortingColumn { 523 if s1 != nil { 524 return s1 525 } 526 return s2 527 } 528 529 func coalesceBloomFilters(f1, f2 []BloomFilterColumn) []BloomFilterColumn { 530 if f1 != nil { 531 return f1 532 } 533 return f2 534 } 535 536 func coalesceCompression(c1, c2 compress.Codec) compress.Codec { 537 if c1 != nil { 538 return c1 539 } 540 return c2 541 } 542 543 func validatePositiveInt(optionName string, optionValue int) error { 544 if optionValue > 0 { 545 return nil 546 } 547 return errorInvalidOptionValue(optionName, optionValue) 548 } 549 550 func validatePositiveInt64(optionName string, optionValue int64) error { 551 if optionValue > 0 { 552 return nil 553 } 554 return errorInvalidOptionValue(optionName, optionValue) 555 } 556 557 func validateOneOfInt(optionName string, optionValue int, supportedValues ...int) error { 558 for _, value := range supportedValues { 559 if value == optionValue { 560 return nil 561 } 562 } 563 return errorInvalidOptionValue(optionName, optionValue) 564 } 565 566 func validateNotNil(optionName string, optionValue interface{}) error { 567 if optionValue != nil { 568 return nil 569 } 570 return errorInvalidOptionValue(optionName, optionValue) 571 } 572 573 func errorInvalidOptionValue(optionName string, optionValue interface{}) error { 574 return fmt.Errorf("invalid option value: %s: %v", optionName, optionValue) 575 } 576 577 func errorInvalidConfiguration(reasons ...error) error { 578 var err *invalidConfiguration 579 580 for _, reason := range reasons { 581 if reason != nil { 582 if err == nil { 583 err = new(invalidConfiguration) 584 } 585 err.reasons = append(err.reasons, reason) 586 } 587 } 588 589 if err != nil { 590 return err 591 } 592 593 return nil 594 } 595 596 type invalidConfiguration struct { 597 reasons []error 598 } 599 600 func (err *invalidConfiguration) Error() string { 601 errorMessage := new(strings.Builder) 602 for _, reason := range err.reasons { 603 errorMessage.WriteString(reason.Error()) 604 errorMessage.WriteString("\n") 605 } 606 errorString := errorMessage.String() 607 if errorString != "" { 608 errorString = errorString[:len(errorString)-1] 609 } 610 return errorString 611 } 612 613 var ( 614 _ FileOption = (*FileConfig)(nil) 615 _ ReaderOption = (*ReaderConfig)(nil) 616 _ WriterOption = (*WriterConfig)(nil) 617 _ RowGroupOption = (*RowGroupConfig)(nil) 618 )