github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/config.go (about) 1 package parquet 2 3 import ( 4 "fmt" 5 "math" 6 "runtime/debug" 7 "strings" 8 "sync" 9 10 "github.com/segmentio/parquet-go/compress" 11 ) 12 13 // ReadMode is an enum that is used to configure the way that a File reads pages. 14 type ReadMode int 15 16 const ( 17 ReadModeSync ReadMode = iota // ReadModeSync reads pages synchronously on demand (Default). 18 ReadModeAsync // ReadModeAsync reads pages asynchronously in the background. 19 ) 20 21 const ( 22 DefaultColumnIndexSizeLimit = 16 23 DefaultColumnBufferCapacity = 16 * 1024 24 DefaultPageBufferSize = 256 * 1024 25 DefaultWriteBufferSize = 32 * 1024 26 DefaultDataPageVersion = 2 27 DefaultDataPageStatistics = false 28 DefaultSkipPageIndex = false 29 DefaultSkipBloomFilters = false 30 DefaultMaxRowsPerRowGroup = math.MaxInt64 31 DefaultReadMode = ReadModeSync 32 ) 33 34 const ( 35 parquetGoModulePath = "github.com/segmentio/parquet-go" 36 ) 37 38 var ( 39 defaultCreatedByInfo string 40 defaultCreatedByOnce sync.Once 41 ) 42 43 func defaultCreatedBy() string { 44 defaultCreatedByOnce.Do(func() { 45 createdBy := parquetGoModulePath 46 build, ok := debug.ReadBuildInfo() 47 if ok { 48 for _, mod := range build.Deps { 49 if mod.Replace == nil && mod.Path == parquetGoModulePath { 50 semver, _, buildsha := parseModuleVersion(mod.Version) 51 createdBy = formatCreatedBy(createdBy, semver, buildsha) 52 break 53 } 54 } 55 } 56 defaultCreatedByInfo = createdBy 57 }) 58 return defaultCreatedByInfo 59 } 60 61 func parseModuleVersion(version string) (semver, datetime, buildsha string) { 62 semver, version = splitModuleVersion(version) 63 datetime, version = splitModuleVersion(version) 64 buildsha, _ = splitModuleVersion(version) 65 semver = strings.TrimPrefix(semver, "v") 66 return 67 } 68 69 func splitModuleVersion(s string) (head, tail string) { 70 if i := strings.IndexByte(s, '-'); i < 0 { 71 head = s 72 } else { 73 head, tail = s[:i], s[i+1:] 74 } 75 return 76 } 77 78 func formatCreatedBy(application, version, build string) string { 79 return application + " version " + version + "(build " + build + ")" 80 } 81 82 // The FileConfig type carries configuration options for parquet files. 83 // 84 // FileConfig implements the FileOption interface so it can be used directly 85 // as argument to the OpenFile function when needed, for example: 86 // 87 // f, err := parquet.OpenFile(reader, size, &parquet.FileConfig{ 88 // SkipPageIndex: true, 89 // SkipBloomFilters: true, 90 // ReadMode: ReadModeAsync, 91 // }) 92 type FileConfig struct { 93 SkipPageIndex bool 94 SkipBloomFilters bool 95 ReadBufferSize int 96 ReadMode ReadMode 97 Schema *Schema 98 } 99 100 // DefaultFileConfig returns a new FileConfig value initialized with the 101 // default file configuration. 102 func DefaultFileConfig() *FileConfig { 103 return &FileConfig{ 104 SkipPageIndex: DefaultSkipPageIndex, 105 SkipBloomFilters: DefaultSkipBloomFilters, 106 ReadBufferSize: defaultReadBufferSize, 107 ReadMode: DefaultReadMode, 108 Schema: nil, 109 } 110 } 111 112 // NewFileConfig constructs a new file configuration applying the options passed 113 // as arguments. 114 // 115 // The function returns an non-nil error if some of the options carried invalid 116 // configuration values. 117 func NewFileConfig(options ...FileOption) (*FileConfig, error) { 118 config := DefaultFileConfig() 119 config.Apply(options...) 120 return config, config.Validate() 121 } 122 123 // Apply applies the given list of options to c. 124 func (c *FileConfig) Apply(options ...FileOption) { 125 for _, opt := range options { 126 opt.ConfigureFile(c) 127 } 128 } 129 130 // ConfigureFile applies configuration options from c to config. 131 func (c *FileConfig) ConfigureFile(config *FileConfig) { 132 *config = FileConfig{ 133 SkipPageIndex: c.SkipPageIndex, 134 SkipBloomFilters: c.SkipBloomFilters, 135 ReadBufferSize: coalesceInt(c.ReadBufferSize, config.ReadBufferSize), 136 ReadMode: ReadMode(coalesceInt(int(c.ReadMode), int(config.ReadMode))), 137 Schema: coalesceSchema(c.Schema, config.Schema), 138 } 139 } 140 141 // Validate returns a non-nil error if the configuration of c is invalid. 142 func (c *FileConfig) Validate() error { 143 return nil 144 } 145 146 // The ReaderConfig type carries configuration options for parquet readers. 147 // 148 // ReaderConfig implements the ReaderOption interface so it can be used directly 149 // as argument to the NewReader function when needed, for example: 150 // 151 // reader := parquet.NewReader(output, schema, &parquet.ReaderConfig{ 152 // // ... 153 // }) 154 type ReaderConfig struct { 155 Schema *Schema 156 } 157 158 // DefaultReaderConfig returns a new ReaderConfig value initialized with the 159 // default reader configuration. 160 func DefaultReaderConfig() *ReaderConfig { 161 return &ReaderConfig{} 162 } 163 164 // NewReaderConfig constructs a new reader configuration applying the options 165 // passed as arguments. 166 // 167 // The function returns an non-nil error if some of the options carried invalid 168 // configuration values. 169 func NewReaderConfig(options ...ReaderOption) (*ReaderConfig, error) { 170 config := DefaultReaderConfig() 171 config.Apply(options...) 172 return config, config.Validate() 173 } 174 175 // Apply applies the given list of options to c. 176 func (c *ReaderConfig) Apply(options ...ReaderOption) { 177 for _, opt := range options { 178 opt.ConfigureReader(c) 179 } 180 } 181 182 // ConfigureReader applies configuration options from c to config. 183 func (c *ReaderConfig) ConfigureReader(config *ReaderConfig) { 184 *config = ReaderConfig{ 185 Schema: coalesceSchema(c.Schema, config.Schema), 186 } 187 } 188 189 // Validate returns a non-nil error if the configuration of c is invalid. 190 func (c *ReaderConfig) Validate() error { 191 return nil 192 } 193 194 // The WriterConfig type carries configuration options for parquet writers. 195 // 196 // WriterConfig implements the WriterOption interface so it can be used directly 197 // as argument to the NewWriter function when needed, for example: 198 // 199 // writer := parquet.NewWriter(output, schema, &parquet.WriterConfig{ 200 // CreatedBy: "my test program", 201 // }) 202 type WriterConfig struct { 203 CreatedBy string 204 ColumnPageBuffers BufferPool 205 ColumnIndexSizeLimit int 206 PageBufferSize int 207 WriteBufferSize int 208 DataPageVersion int 209 DataPageStatistics bool 210 MaxRowsPerRowGroup int64 211 KeyValueMetadata map[string]string 212 Schema *Schema 213 BloomFilters []BloomFilterColumn 214 Compression compress.Codec 215 Sorting SortingConfig 216 } 217 218 // DefaultWriterConfig returns a new WriterConfig value initialized with the 219 // default writer configuration. 220 func DefaultWriterConfig() *WriterConfig { 221 return &WriterConfig{ 222 CreatedBy: defaultCreatedBy(), 223 ColumnPageBuffers: &defaultColumnBufferPool, 224 ColumnIndexSizeLimit: DefaultColumnIndexSizeLimit, 225 PageBufferSize: DefaultPageBufferSize, 226 WriteBufferSize: DefaultWriteBufferSize, 227 DataPageVersion: DefaultDataPageVersion, 228 DataPageStatistics: DefaultDataPageStatistics, 229 MaxRowsPerRowGroup: DefaultMaxRowsPerRowGroup, 230 Sorting: SortingConfig{ 231 SortingBuffers: &defaultSortingBufferPool, 232 }, 233 } 234 } 235 236 // NewWriterConfig constructs a new writer configuration applying the options 237 // passed as arguments. 238 // 239 // The function returns an non-nil error if some of the options carried invalid 240 // configuration values. 241 func NewWriterConfig(options ...WriterOption) (*WriterConfig, error) { 242 config := DefaultWriterConfig() 243 config.Apply(options...) 244 return config, config.Validate() 245 } 246 247 // Apply applies the given list of options to c. 248 func (c *WriterConfig) Apply(options ...WriterOption) { 249 for _, opt := range options { 250 opt.ConfigureWriter(c) 251 } 252 } 253 254 // ConfigureWriter applies configuration options from c to config. 255 func (c *WriterConfig) ConfigureWriter(config *WriterConfig) { 256 keyValueMetadata := config.KeyValueMetadata 257 if len(c.KeyValueMetadata) > 0 { 258 if keyValueMetadata == nil { 259 keyValueMetadata = make(map[string]string, len(c.KeyValueMetadata)) 260 } 261 for k, v := range c.KeyValueMetadata { 262 keyValueMetadata[k] = v 263 } 264 } 265 266 *config = WriterConfig{ 267 CreatedBy: coalesceString(c.CreatedBy, config.CreatedBy), 268 ColumnPageBuffers: coalesceBufferPool(c.ColumnPageBuffers, config.ColumnPageBuffers), 269 ColumnIndexSizeLimit: coalesceInt(c.ColumnIndexSizeLimit, config.ColumnIndexSizeLimit), 270 PageBufferSize: coalesceInt(c.PageBufferSize, config.PageBufferSize), 271 WriteBufferSize: coalesceInt(c.WriteBufferSize, config.WriteBufferSize), 272 DataPageVersion: coalesceInt(c.DataPageVersion, config.DataPageVersion), 273 DataPageStatistics: config.DataPageStatistics, 274 MaxRowsPerRowGroup: config.MaxRowsPerRowGroup, 275 KeyValueMetadata: keyValueMetadata, 276 Schema: coalesceSchema(c.Schema, config.Schema), 277 BloomFilters: coalesceBloomFilters(c.BloomFilters, config.BloomFilters), 278 Compression: coalesceCompression(c.Compression, config.Compression), 279 Sorting: coalesceSortingConfig(c.Sorting, config.Sorting), 280 } 281 } 282 283 // Validate returns a non-nil error if the configuration of c is invalid. 284 func (c *WriterConfig) Validate() error { 285 const baseName = "parquet.(*WriterConfig)." 286 return errorInvalidConfiguration( 287 validateNotNil(baseName+"ColumnPageBuffers", c.ColumnPageBuffers), 288 validatePositiveInt(baseName+"ColumnIndexSizeLimit", c.ColumnIndexSizeLimit), 289 validatePositiveInt(baseName+"PageBufferSize", c.PageBufferSize), 290 validateOneOfInt(baseName+"DataPageVersion", c.DataPageVersion, 1, 2), 291 c.Sorting.Validate(), 292 ) 293 } 294 295 // The RowGroupConfig type carries configuration options for parquet row groups. 296 // 297 // RowGroupConfig implements the RowGroupOption interface so it can be used 298 // directly as argument to the NewBuffer function when needed, for example: 299 // 300 // buffer := parquet.NewBuffer(&parquet.RowGroupConfig{ 301 // ColumnBufferCapacity: 10_000, 302 // }) 303 type RowGroupConfig struct { 304 ColumnBufferCapacity int 305 Schema *Schema 306 Sorting SortingConfig 307 } 308 309 // DefaultRowGroupConfig returns a new RowGroupConfig value initialized with the 310 // default row group configuration. 311 func DefaultRowGroupConfig() *RowGroupConfig { 312 return &RowGroupConfig{ 313 ColumnBufferCapacity: DefaultColumnBufferCapacity, 314 Sorting: SortingConfig{ 315 SortingBuffers: &defaultSortingBufferPool, 316 }, 317 } 318 } 319 320 // NewRowGroupConfig constructs a new row group configuration applying the 321 // options passed as arguments. 322 // 323 // The function returns an non-nil error if some of the options carried invalid 324 // configuration values. 325 func NewRowGroupConfig(options ...RowGroupOption) (*RowGroupConfig, error) { 326 config := DefaultRowGroupConfig() 327 config.Apply(options...) 328 return config, config.Validate() 329 } 330 331 // Validate returns a non-nil error if the configuration of c is invalid. 332 func (c *RowGroupConfig) Validate() error { 333 const baseName = "parquet.(*RowGroupConfig)." 334 return errorInvalidConfiguration( 335 validatePositiveInt(baseName+"ColumnBufferCapacity", c.ColumnBufferCapacity), 336 c.Sorting.Validate(), 337 ) 338 } 339 340 func (c *RowGroupConfig) Apply(options ...RowGroupOption) { 341 for _, opt := range options { 342 opt.ConfigureRowGroup(c) 343 } 344 } 345 346 func (c *RowGroupConfig) ConfigureRowGroup(config *RowGroupConfig) { 347 *config = RowGroupConfig{ 348 ColumnBufferCapacity: coalesceInt(c.ColumnBufferCapacity, config.ColumnBufferCapacity), 349 Schema: coalesceSchema(c.Schema, config.Schema), 350 Sorting: coalesceSortingConfig(c.Sorting, config.Sorting), 351 } 352 } 353 354 // The SortingConfig type carries configuration options for parquet row groups. 355 // 356 // SortingConfig implements the SortingOption interface so it can be used 357 // directly as argument to the NewSortingWriter function when needed, 358 // for example: 359 // 360 // buffer := parquet.NewSortingWriter[Row]( 361 // parquet.SortingWriterConfig( 362 // parquet.DropDuplicatedRows(true), 363 // ), 364 // }) 365 type SortingConfig struct { 366 SortingBuffers BufferPool 367 SortingColumns []SortingColumn 368 DropDuplicatedRows bool 369 } 370 371 // DefaultSortingConfig returns a new SortingConfig value initialized with the 372 // default row group configuration. 373 func DefaultSortingConfig() *SortingConfig { 374 return &SortingConfig{ 375 SortingBuffers: &defaultSortingBufferPool, 376 } 377 } 378 379 // NewSortingConfig constructs a new sorting configuration applying the 380 // options passed as arguments. 381 // 382 // The function returns an non-nil error if some of the options carried invalid 383 // configuration values. 384 func NewSortingConfig(options ...SortingOption) (*SortingConfig, error) { 385 config := DefaultSortingConfig() 386 config.Apply(options...) 387 return config, config.Validate() 388 } 389 390 func (c *SortingConfig) Validate() error { 391 const baseName = "parquet.(*SortingConfig)." 392 return errorInvalidConfiguration( 393 validateNotNil(baseName+"SortingBuffers", c.SortingBuffers), 394 ) 395 } 396 397 func (c *SortingConfig) Apply(options ...SortingOption) { 398 for _, opt := range options { 399 opt.ConfigureSorting(c) 400 } 401 } 402 403 func (c *SortingConfig) ConfigureSorting(config *SortingConfig) { 404 *config = coalesceSortingConfig(*c, *config) 405 } 406 407 // FileOption is an interface implemented by types that carry configuration 408 // options for parquet files. 409 type FileOption interface { 410 ConfigureFile(*FileConfig) 411 } 412 413 // ReaderOption is an interface implemented by types that carry configuration 414 // options for parquet readers. 415 type ReaderOption interface { 416 ConfigureReader(*ReaderConfig) 417 } 418 419 // WriterOption is an interface implemented by types that carry configuration 420 // options for parquet writers. 421 type WriterOption interface { 422 ConfigureWriter(*WriterConfig) 423 } 424 425 // RowGroupOption is an interface implemented by types that carry configuration 426 // options for parquet row groups. 427 type RowGroupOption interface { 428 ConfigureRowGroup(*RowGroupConfig) 429 } 430 431 // SortingOption is an interface implemented by types that carry configuration 432 // options for parquet sorting writers. 433 type SortingOption interface { 434 ConfigureSorting(*SortingConfig) 435 } 436 437 // SkipPageIndex is a file configuration option which prevents automatically 438 // reading the page index when opening a parquet file, when set to true. This is 439 // useful as an optimization when programs know that they will not need to 440 // consume the page index. 441 // 442 // Defaults to false. 443 func SkipPageIndex(skip bool) FileOption { 444 return fileOption(func(config *FileConfig) { config.SkipPageIndex = skip }) 445 } 446 447 // SkipBloomFilters is a file configuration option which prevents automatically 448 // reading the bloom filters when opening a parquet file, when set to true. 449 // This is useful as an optimization when programs know that they will not need 450 // to consume the bloom filters. 451 // 452 // Defaults to false. 453 func SkipBloomFilters(skip bool) FileOption { 454 return fileOption(func(config *FileConfig) { config.SkipBloomFilters = skip }) 455 } 456 457 // FileReadMode is a file configuration option which controls the way pages 458 // are read. Currently the only two options are ReadModeAsync and ReadModeSync 459 // which control whether or not pages are loaded asynchronously. It can be 460 // advantageous to use ReadModeAsync if your reader is backed by network 461 // storage. 462 // 463 // Defaults to ReadModeSync. 464 func FileReadMode(mode ReadMode) FileOption { 465 return fileOption(func(config *FileConfig) { config.ReadMode = mode }) 466 } 467 468 // ReadBufferSize is a file configuration option which controls the default 469 // buffer sizes for reads made to the provided io.Reader. The default of 4096 470 // is appropriate for disk based access but if your reader is backed by network 471 // storage it can be advantageous to increase this value to something more like 472 // 4 MiB. 473 // 474 // Defaults to 4096. 475 func ReadBufferSize(size int) FileOption { 476 return fileOption(func(config *FileConfig) { config.ReadBufferSize = size }) 477 } 478 479 // FileSchema is used to pass a known schema in while opening a Parquet file. 480 // This optimization is only useful if your application is currently opening 481 // an extremely large number of parquet files with the same, known schema. 482 // 483 // Defaults to nil. 484 func FileSchema(schema *Schema) FileOption { 485 return fileOption(func(config *FileConfig) { config.Schema = schema }) 486 } 487 488 // PageBufferSize configures the size of column page buffers on parquet writers. 489 // 490 // Note that the page buffer size refers to the in-memory buffers where pages 491 // are generated, not the size of pages after encoding and compression. 492 // This design choice was made to help control the amount of memory needed to 493 // read and write pages rather than controlling the space used by the encoded 494 // representation on disk. 495 // 496 // Defaults to 256KiB. 497 func PageBufferSize(size int) WriterOption { 498 return writerOption(func(config *WriterConfig) { config.PageBufferSize = size }) 499 } 500 501 // WriteBufferSize configures the size of the write buffer. 502 // 503 // Setting the writer buffer size to zero deactivates buffering, all writes are 504 // immediately sent to the output io.Writer. 505 // 506 // Defaults to 32KiB. 507 func WriteBufferSize(size int) WriterOption { 508 return writerOption(func(config *WriterConfig) { config.WriteBufferSize = size }) 509 } 510 511 // MaxRowsPerRowGroup configures the maximum number of rows that a writer will 512 // produce in each row group. 513 // 514 // This limit is useful to control size of row groups in both number of rows and 515 // byte size. While controlling the byte size of a row group is difficult to 516 // achieve with parquet due to column encoding and compression, the number of 517 // rows remains a useful proxy. 518 // 519 // Defaults to unlimited. 520 func MaxRowsPerRowGroup(numRows int64) WriterOption { 521 if numRows <= 0 { 522 numRows = DefaultMaxRowsPerRowGroup 523 } 524 return writerOption(func(config *WriterConfig) { config.MaxRowsPerRowGroup = numRows }) 525 } 526 527 // CreatedBy creates a configuration option which sets the name of the 528 // application that created a parquet file. 529 // 530 // The option formats the "CreatedBy" file metadata according to the convention 531 // described by the parquet spec: 532 // 533 // "<application> version <version> (build <build>)" 534 // 535 // By default, the option is set to the parquet-go module name, version, and 536 // build hash. 537 func CreatedBy(application, version, build string) WriterOption { 538 createdBy := formatCreatedBy(application, version, build) 539 return writerOption(func(config *WriterConfig) { config.CreatedBy = createdBy }) 540 } 541 542 // ColumnPageBuffers creates a configuration option to customize the buffer pool 543 // used when constructing row groups. This can be used to provide on-disk buffers 544 // as swap space to ensure that the parquet file creation will no be bottlenecked 545 // on the amount of memory available. 546 // 547 // Defaults to using in-memory buffers. 548 func ColumnPageBuffers(buffers BufferPool) WriterOption { 549 return writerOption(func(config *WriterConfig) { config.ColumnPageBuffers = buffers }) 550 } 551 552 // ColumnIndexSizeLimit creates a configuration option to customize the size 553 // limit of page boundaries recorded in column indexes. 554 // 555 // Defaults to 16. 556 func ColumnIndexSizeLimit(sizeLimit int) WriterOption { 557 return writerOption(func(config *WriterConfig) { config.ColumnIndexSizeLimit = sizeLimit }) 558 } 559 560 // DataPageVersion creates a configuration option which configures the version of 561 // data pages used when creating a parquet file. 562 // 563 // Defaults to version 2. 564 func DataPageVersion(version int) WriterOption { 565 return writerOption(func(config *WriterConfig) { config.DataPageVersion = version }) 566 } 567 568 // DataPageStatistics creates a configuration option which defines whether data 569 // page statistics are emitted. This option is useful when generating parquet 570 // files that intend to be backward compatible with older readers which may not 571 // have the ability to load page statistics from the column index. 572 // 573 // Defaults to false. 574 func DataPageStatistics(enabled bool) WriterOption { 575 return writerOption(func(config *WriterConfig) { config.DataPageStatistics = enabled }) 576 } 577 578 // KeyValueMetadata creates a configuration option which adds key/value metadata 579 // to add to the metadata of parquet files. 580 // 581 // This option is additive, it may be used multiple times to add more than one 582 // key/value pair. 583 // 584 // Keys are assumed to be unique, if the same key is repeated multiple times the 585 // last value is retained. While the parquet format does not require unique keys, 586 // this design decision was made to optimize for the most common use case where 587 // applications leverage this extension mechanism to associate single values to 588 // keys. This may create incompatibilities with other parquet libraries, or may 589 // cause some key/value pairs to be lost when open parquet files written with 590 // repeated keys. We can revisit this decision if it ever becomes a blocker. 591 func KeyValueMetadata(key, value string) WriterOption { 592 return writerOption(func(config *WriterConfig) { 593 if config.KeyValueMetadata == nil { 594 config.KeyValueMetadata = map[string]string{key: value} 595 } else { 596 config.KeyValueMetadata[key] = value 597 } 598 }) 599 } 600 601 // BloomFilters creates a configuration option which defines the bloom filters 602 // that parquet writers should generate. 603 // 604 // The compute and memory footprint of generating bloom filters for all columns 605 // of a parquet schema can be significant, so by default no filters are created 606 // and applications need to explicitly declare the columns that they want to 607 // create filters for. 608 func BloomFilters(filters ...BloomFilterColumn) WriterOption { 609 filters = append([]BloomFilterColumn{}, filters...) 610 return writerOption(func(config *WriterConfig) { config.BloomFilters = filters }) 611 } 612 613 // Compression creates a configuration option which sets the default compression 614 // codec used by a writer for columns where none were defined. 615 func Compression(codec compress.Codec) WriterOption { 616 return writerOption(func(config *WriterConfig) { config.Compression = codec }) 617 } 618 619 // SortingWriterConfig is a writer option which applies configuration specific 620 // to sorting writers. 621 func SortingWriterConfig(options ...SortingOption) WriterOption { 622 options = append([]SortingOption{}, options...) 623 return writerOption(func(config *WriterConfig) { config.Sorting.Apply(options...) }) 624 } 625 626 // ColumnBufferCapacity creates a configuration option which defines the size of 627 // row group column buffers. 628 // 629 // Defaults to 16384. 630 func ColumnBufferCapacity(size int) RowGroupOption { 631 return rowGroupOption(func(config *RowGroupConfig) { config.ColumnBufferCapacity = size }) 632 } 633 634 // SortingRowGroupConfig is a row group option which applies configuration 635 // specific sorting row groups. 636 func SortingRowGroupConfig(options ...SortingOption) RowGroupOption { 637 options = append([]SortingOption{}, options...) 638 return rowGroupOption(func(config *RowGroupConfig) { config.Sorting.Apply(options...) }) 639 } 640 641 // SortingColumns creates a configuration option which defines the sorting order 642 // of columns in a row group. 643 // 644 // The order of sorting columns passed as argument defines the ordering 645 // hierarchy; when elements are equal in the first column, the second column is 646 // used to order rows, etc... 647 func SortingColumns(columns ...SortingColumn) SortingOption { 648 // Make a copy so that we do not retain the input slice generated implicitly 649 // for the variable argument list, and also avoid having a nil slice when 650 // the option is passed with no sorting columns, so we can differentiate it 651 // from it not being passed. 652 columns = append([]SortingColumn{}, columns...) 653 return sortingOption(func(config *SortingConfig) { config.SortingColumns = columns }) 654 } 655 656 // SortingBuffers creates a configuration option which sets the pool of buffers 657 // used to hold intermediary state when sorting parquet rows. 658 // 659 // Defaults to using in-memory buffers. 660 func SortingBuffers(buffers BufferPool) SortingOption { 661 return sortingOption(func(config *SortingConfig) { config.SortingBuffers = buffers }) 662 } 663 664 // DropDuplicatedRows configures whether a sorting writer will keep or remove 665 // duplicated rows. 666 // 667 // Two rows are considered duplicates if the values of their all their sorting 668 // columns are equal. 669 // 670 // Defaults to false 671 func DropDuplicatedRows(drop bool) SortingOption { 672 return sortingOption(func(config *SortingConfig) { config.DropDuplicatedRows = drop }) 673 } 674 675 type fileOption func(*FileConfig) 676 677 func (opt fileOption) ConfigureFile(config *FileConfig) { opt(config) } 678 679 type readerOption func(*ReaderConfig) 680 681 func (opt readerOption) ConfigureReader(config *ReaderConfig) { opt(config) } 682 683 type writerOption func(*WriterConfig) 684 685 func (opt writerOption) ConfigureWriter(config *WriterConfig) { opt(config) } 686 687 type rowGroupOption func(*RowGroupConfig) 688 689 func (opt rowGroupOption) ConfigureRowGroup(config *RowGroupConfig) { opt(config) } 690 691 type sortingOption func(*SortingConfig) 692 693 func (opt sortingOption) ConfigureSorting(config *SortingConfig) { opt(config) } 694 695 func coalesceInt(i1, i2 int) int { 696 if i1 != 0 { 697 return i1 698 } 699 return i2 700 } 701 702 func coalesceInt64(i1, i2 int64) int64 { 703 if i1 != 0 { 704 return i1 705 } 706 return i2 707 } 708 709 func coalesceString(s1, s2 string) string { 710 if s1 != "" { 711 return s1 712 } 713 return s2 714 } 715 716 func coalesceBytes(b1, b2 []byte) []byte { 717 if b1 != nil { 718 return b1 719 } 720 return b2 721 } 722 723 func coalesceBufferPool(p1, p2 BufferPool) BufferPool { 724 if p1 != nil { 725 return p1 726 } 727 return p2 728 } 729 730 func coalesceSchema(s1, s2 *Schema) *Schema { 731 if s1 != nil { 732 return s1 733 } 734 return s2 735 } 736 737 func coalesceSortingColumns(s1, s2 []SortingColumn) []SortingColumn { 738 if s1 != nil { 739 return s1 740 } 741 return s2 742 } 743 744 func coalesceSortingConfig(c1, c2 SortingConfig) SortingConfig { 745 return SortingConfig{ 746 SortingBuffers: coalesceBufferPool(c1.SortingBuffers, c2.SortingBuffers), 747 SortingColumns: coalesceSortingColumns(c1.SortingColumns, c2.SortingColumns), 748 DropDuplicatedRows: c1.DropDuplicatedRows, 749 } 750 } 751 752 func coalesceBloomFilters(f1, f2 []BloomFilterColumn) []BloomFilterColumn { 753 if f1 != nil { 754 return f1 755 } 756 return f2 757 } 758 759 func coalesceCompression(c1, c2 compress.Codec) compress.Codec { 760 if c1 != nil { 761 return c1 762 } 763 return c2 764 } 765 766 func validatePositiveInt(optionName string, optionValue int) error { 767 if optionValue > 0 { 768 return nil 769 } 770 return errorInvalidOptionValue(optionName, optionValue) 771 } 772 773 func validatePositiveInt64(optionName string, optionValue int64) error { 774 if optionValue > 0 { 775 return nil 776 } 777 return errorInvalidOptionValue(optionName, optionValue) 778 } 779 780 func validateOneOfInt(optionName string, optionValue int, supportedValues ...int) error { 781 for _, value := range supportedValues { 782 if value == optionValue { 783 return nil 784 } 785 } 786 return errorInvalidOptionValue(optionName, optionValue) 787 } 788 789 func validateNotNil(optionName string, optionValue interface{}) error { 790 if optionValue != nil { 791 return nil 792 } 793 return errorInvalidOptionValue(optionName, optionValue) 794 } 795 796 func errorInvalidOptionValue(optionName string, optionValue interface{}) error { 797 return fmt.Errorf("invalid option value: %s: %v", optionName, optionValue) 798 } 799 800 func errorInvalidConfiguration(reasons ...error) error { 801 var err *invalidConfiguration 802 803 for _, reason := range reasons { 804 if reason != nil { 805 if err == nil { 806 err = new(invalidConfiguration) 807 } 808 err.reasons = append(err.reasons, reason) 809 } 810 } 811 812 if err != nil { 813 return err 814 } 815 816 return nil 817 } 818 819 type invalidConfiguration struct { 820 reasons []error 821 } 822 823 func (err *invalidConfiguration) Error() string { 824 errorMessage := new(strings.Builder) 825 for _, reason := range err.reasons { 826 errorMessage.WriteString(reason.Error()) 827 errorMessage.WriteString("\n") 828 } 829 errorString := errorMessage.String() 830 if errorString != "" { 831 errorString = errorString[:len(errorString)-1] 832 } 833 return errorString 834 } 835 836 var ( 837 _ FileOption = (*FileConfig)(nil) 838 _ ReaderOption = (*ReaderConfig)(nil) 839 _ WriterOption = (*WriterConfig)(nil) 840 _ RowGroupOption = (*RowGroupConfig)(nil) 841 _ SortingOption = (*SortingConfig)(nil) 842 )