github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/config.go (about)

     1  package parquet
     2  
     3  import (
     4  	"fmt"
     5  	"math"
     6  	"runtime/debug"
     7  	"strings"
     8  	"sync"
     9  
    10  	"github.com/parquet-go/parquet-go/compress"
    11  )
    12  
// ReadMode is an enum that is used to configure the way that a File reads pages.
type ReadMode int

// Supported read modes. ReadModeSync is declared first and is therefore the
// zero value of ReadMode.
const (
	ReadModeSync  ReadMode = iota // ReadModeSync reads pages synchronously on demand (Default).
	ReadModeAsync                 // ReadModeAsync reads pages asynchronously in the background.
)
    20  
// Default values used by the Default*Config constructors of this file. The
// trailing comment on each constant names the configuration field that it
// initializes.
const (
	DefaultColumnIndexSizeLimit = 16            // WriterConfig.ColumnIndexSizeLimit
	DefaultColumnBufferCapacity = 16 * 1024     // RowGroupConfig.ColumnBufferCapacity
	DefaultPageBufferSize       = 256 * 1024    // WriterConfig.PageBufferSize (256 KiB)
	DefaultWriteBufferSize      = 32 * 1024     // WriterConfig.WriteBufferSize (32 KiB)
	DefaultDataPageVersion      = 2             // WriterConfig.DataPageVersion
	DefaultDataPageStatistics   = false         // WriterConfig.DataPageStatistics
	DefaultSkipPageIndex        = false         // FileConfig.SkipPageIndex
	DefaultSkipBloomFilters     = false         // FileConfig.SkipBloomFilters
	DefaultMaxRowsPerRowGroup   = math.MaxInt64 // WriterConfig.MaxRowsPerRowGroup (effectively unlimited)
	DefaultReadMode             = ReadModeSync  // FileConfig.ReadMode
)
    33  
const (
	// parquetGoModulePath is the module path looked up in the build
	// information, and the application name of the default "created by"
	// string.
	parquetGoModulePath = "github.com/parquet-go/parquet-go"
)

var (
	// defaultCreatedByInfo caches the result of defaultCreatedBy; it is
	// assigned exactly once, guarded by defaultCreatedByOnce.
	defaultCreatedByInfo string
	defaultCreatedByOnce sync.Once
)
    42  
    43  func defaultCreatedBy() string {
    44  	defaultCreatedByOnce.Do(func() {
    45  		createdBy := parquetGoModulePath
    46  		build, ok := debug.ReadBuildInfo()
    47  		if ok {
    48  			for _, mod := range build.Deps {
    49  				if mod.Replace == nil && mod.Path == parquetGoModulePath {
    50  					semver, _, buildsha := parseModuleVersion(mod.Version)
    51  					createdBy = formatCreatedBy(createdBy, semver, buildsha)
    52  					break
    53  				}
    54  			}
    55  		}
    56  		defaultCreatedByInfo = createdBy
    57  	})
    58  	return defaultCreatedByInfo
    59  }
    60  
    61  func parseModuleVersion(version string) (semver, datetime, buildsha string) {
    62  	semver, version = splitModuleVersion(version)
    63  	datetime, version = splitModuleVersion(version)
    64  	buildsha, _ = splitModuleVersion(version)
    65  	semver = strings.TrimPrefix(semver, "v")
    66  	return
    67  }
    68  
    69  func splitModuleVersion(s string) (head, tail string) {
    70  	if i := strings.IndexByte(s, '-'); i < 0 {
    71  		head = s
    72  	} else {
    73  		head, tail = s[:i], s[i+1:]
    74  	}
    75  	return
    76  }
    77  
    78  func formatCreatedBy(application, version, build string) string {
    79  	return application + " version " + version + "(build " + build + ")"
    80  }
    81  
// The FileConfig type carries configuration options for parquet files.
//
// FileConfig implements the FileOption interface so it can be used directly
// as argument to the OpenFile function when needed, for example:
//
//	f, err := parquet.OpenFile(reader, size, &parquet.FileConfig{
//		SkipPageIndex:    true,
//		SkipBloomFilters: true,
//		ReadMode:         ReadModeAsync,
//	})
type FileConfig struct {
	SkipPageIndex    bool     // when true, the page index is not read automatically when opening a file
	SkipBloomFilters bool     // when true, bloom filters are not read automatically when opening a file
	ReadBufferSize   int      // default buffer size for reads made to the underlying reader
	ReadMode         ReadMode // ReadModeSync (default) or ReadModeAsync
	Schema           *Schema  // known schema passed in when opening a file; nil by default
}
    99  
   100  // DefaultFileConfig returns a new FileConfig value initialized with the
   101  // default file configuration.
   102  func DefaultFileConfig() *FileConfig {
   103  	return &FileConfig{
   104  		SkipPageIndex:    DefaultSkipPageIndex,
   105  		SkipBloomFilters: DefaultSkipBloomFilters,
   106  		ReadBufferSize:   defaultReadBufferSize,
   107  		ReadMode:         DefaultReadMode,
   108  		Schema:           nil,
   109  	}
   110  }
   111  
   112  // NewFileConfig constructs a new file configuration applying the options passed
   113  // as arguments.
   114  //
   115  // The function returns an non-nil error if some of the options carried invalid
   116  // configuration values.
   117  func NewFileConfig(options ...FileOption) (*FileConfig, error) {
   118  	config := DefaultFileConfig()
   119  	config.Apply(options...)
   120  	return config, config.Validate()
   121  }
   122  
   123  // Apply applies the given list of options to c.
   124  func (c *FileConfig) Apply(options ...FileOption) {
   125  	for _, opt := range options {
   126  		opt.ConfigureFile(c)
   127  	}
   128  }
   129  
   130  // ConfigureFile applies configuration options from c to config.
   131  func (c *FileConfig) ConfigureFile(config *FileConfig) {
   132  	*config = FileConfig{
   133  		SkipPageIndex:    c.SkipPageIndex,
   134  		SkipBloomFilters: c.SkipBloomFilters,
   135  		ReadBufferSize:   coalesceInt(c.ReadBufferSize, config.ReadBufferSize),
   136  		ReadMode:         ReadMode(coalesceInt(int(c.ReadMode), int(config.ReadMode))),
   137  		Schema:           coalesceSchema(c.Schema, config.Schema),
   138  	}
   139  }
   140  
   141  // Validate returns a non-nil error if the configuration of c is invalid.
   142  func (c *FileConfig) Validate() error {
   143  	return nil
   144  }
   145  
// The ReaderConfig type carries configuration options for parquet readers.
//
// ReaderConfig implements the ReaderOption interface so it can be used directly
// as argument to the NewReader function when needed, for example:
//
//	reader := parquet.NewReader(output, schema, &parquet.ReaderConfig{
//		// ...
//	})
type ReaderConfig struct {
	Schema *Schema // schema carried by the configuration; nil by default
}
   157  
   158  // DefaultReaderConfig returns a new ReaderConfig value initialized with the
   159  // default reader configuration.
   160  func DefaultReaderConfig() *ReaderConfig {
   161  	return &ReaderConfig{}
   162  }
   163  
   164  // NewReaderConfig constructs a new reader configuration applying the options
   165  // passed as arguments.
   166  //
   167  // The function returns an non-nil error if some of the options carried invalid
   168  // configuration values.
   169  func NewReaderConfig(options ...ReaderOption) (*ReaderConfig, error) {
   170  	config := DefaultReaderConfig()
   171  	config.Apply(options...)
   172  	return config, config.Validate()
   173  }
   174  
   175  // Apply applies the given list of options to c.
   176  func (c *ReaderConfig) Apply(options ...ReaderOption) {
   177  	for _, opt := range options {
   178  		opt.ConfigureReader(c)
   179  	}
   180  }
   181  
   182  // ConfigureReader applies configuration options from c to config.
   183  func (c *ReaderConfig) ConfigureReader(config *ReaderConfig) {
   184  	*config = ReaderConfig{
   185  		Schema: coalesceSchema(c.Schema, config.Schema),
   186  	}
   187  }
   188  
   189  // Validate returns a non-nil error if the configuration of c is invalid.
   190  func (c *ReaderConfig) Validate() error {
   191  	return nil
   192  }
   193  
// The WriterConfig type carries configuration options for parquet writers.
//
// WriterConfig implements the WriterOption interface so it can be used directly
// as argument to the NewWriter function when needed, for example:
//
//	writer := parquet.NewWriter(output, schema, &parquet.WriterConfig{
//		CreatedBy: "my test program",
//	})
type WriterConfig struct {
	CreatedBy            string              // name of the application that created the file
	ColumnPageBuffers    BufferPool          // buffer pool used when constructing row groups; must not be nil
	ColumnIndexSizeLimit int                 // size limit of page boundaries recorded in column indexes; must be positive
	PageBufferSize       int                 // size of in-memory column page buffers; must be positive
	WriteBufferSize      int                 // size of the write buffer
	DataPageVersion      int                 // data page version; must be 1 or 2
	DataPageStatistics   bool                // whether data page statistics are emitted
	MaxRowsPerRowGroup   int64               // maximum number of rows produced in each row group
	KeyValueMetadata     map[string]string   // key/value pairs added to the file metadata
	Schema               *Schema             // schema carried by the configuration; nil by default
	BloomFilters         []BloomFilterColumn // bloom filters that the writer should generate
	Compression          compress.Codec      // default codec for columns where none was defined
	Sorting              SortingConfig       // configuration specific to sorting writers
}
   217  
   218  // DefaultWriterConfig returns a new WriterConfig value initialized with the
   219  // default writer configuration.
   220  func DefaultWriterConfig() *WriterConfig {
   221  	return &WriterConfig{
   222  		CreatedBy:            defaultCreatedBy(),
   223  		ColumnPageBuffers:    &defaultColumnBufferPool,
   224  		ColumnIndexSizeLimit: DefaultColumnIndexSizeLimit,
   225  		PageBufferSize:       DefaultPageBufferSize,
   226  		WriteBufferSize:      DefaultWriteBufferSize,
   227  		DataPageVersion:      DefaultDataPageVersion,
   228  		DataPageStatistics:   DefaultDataPageStatistics,
   229  		MaxRowsPerRowGroup:   DefaultMaxRowsPerRowGroup,
   230  		Sorting: SortingConfig{
   231  			SortingBuffers: &defaultSortingBufferPool,
   232  		},
   233  	}
   234  }
   235  
   236  // NewWriterConfig constructs a new writer configuration applying the options
   237  // passed as arguments.
   238  //
   239  // The function returns an non-nil error if some of the options carried invalid
   240  // configuration values.
   241  func NewWriterConfig(options ...WriterOption) (*WriterConfig, error) {
   242  	config := DefaultWriterConfig()
   243  	config.Apply(options...)
   244  	return config, config.Validate()
   245  }
   246  
   247  // Apply applies the given list of options to c.
   248  func (c *WriterConfig) Apply(options ...WriterOption) {
   249  	for _, opt := range options {
   250  		opt.ConfigureWriter(c)
   251  	}
   252  }
   253  
   254  // ConfigureWriter applies configuration options from c to config.
   255  func (c *WriterConfig) ConfigureWriter(config *WriterConfig) {
   256  	keyValueMetadata := config.KeyValueMetadata
   257  	if len(c.KeyValueMetadata) > 0 {
   258  		if keyValueMetadata == nil {
   259  			keyValueMetadata = make(map[string]string, len(c.KeyValueMetadata))
   260  		}
   261  		for k, v := range c.KeyValueMetadata {
   262  			keyValueMetadata[k] = v
   263  		}
   264  	}
   265  
   266  	*config = WriterConfig{
   267  		CreatedBy:            coalesceString(c.CreatedBy, config.CreatedBy),
   268  		ColumnPageBuffers:    coalesceBufferPool(c.ColumnPageBuffers, config.ColumnPageBuffers),
   269  		ColumnIndexSizeLimit: coalesceInt(c.ColumnIndexSizeLimit, config.ColumnIndexSizeLimit),
   270  		PageBufferSize:       coalesceInt(c.PageBufferSize, config.PageBufferSize),
   271  		WriteBufferSize:      coalesceInt(c.WriteBufferSize, config.WriteBufferSize),
   272  		DataPageVersion:      coalesceInt(c.DataPageVersion, config.DataPageVersion),
   273  		DataPageStatistics:   config.DataPageStatistics,
   274  		MaxRowsPerRowGroup:   config.MaxRowsPerRowGroup,
   275  		KeyValueMetadata:     keyValueMetadata,
   276  		Schema:               coalesceSchema(c.Schema, config.Schema),
   277  		BloomFilters:         coalesceBloomFilters(c.BloomFilters, config.BloomFilters),
   278  		Compression:          coalesceCompression(c.Compression, config.Compression),
   279  		Sorting:              coalesceSortingConfig(c.Sorting, config.Sorting),
   280  	}
   281  }
   282  
   283  // Validate returns a non-nil error if the configuration of c is invalid.
   284  func (c *WriterConfig) Validate() error {
   285  	const baseName = "parquet.(*WriterConfig)."
   286  	return errorInvalidConfiguration(
   287  		validateNotNil(baseName+"ColumnPageBuffers", c.ColumnPageBuffers),
   288  		validatePositiveInt(baseName+"ColumnIndexSizeLimit", c.ColumnIndexSizeLimit),
   289  		validatePositiveInt(baseName+"PageBufferSize", c.PageBufferSize),
   290  		validateOneOfInt(baseName+"DataPageVersion", c.DataPageVersion, 1, 2),
   291  		c.Sorting.Validate(),
   292  	)
   293  }
   294  
// The RowGroupConfig type carries configuration options for parquet row groups.
//
// RowGroupConfig implements the RowGroupOption interface so it can be used
// directly as argument to the NewBuffer function when needed, for example:
//
//	buffer := parquet.NewBuffer(&parquet.RowGroupConfig{
//		ColumnBufferCapacity: 10_000,
//	})
type RowGroupConfig struct {
	ColumnBufferCapacity int           // size of row group column buffers; must be positive
	Schema               *Schema       // schema carried by the configuration; nil by default
	Sorting              SortingConfig // sorting configuration of the row group
}
   308  
   309  // DefaultRowGroupConfig returns a new RowGroupConfig value initialized with the
   310  // default row group configuration.
   311  func DefaultRowGroupConfig() *RowGroupConfig {
   312  	return &RowGroupConfig{
   313  		ColumnBufferCapacity: DefaultColumnBufferCapacity,
   314  		Sorting: SortingConfig{
   315  			SortingBuffers: &defaultSortingBufferPool,
   316  		},
   317  	}
   318  }
   319  
   320  // NewRowGroupConfig constructs a new row group configuration applying the
   321  // options passed as arguments.
   322  //
   323  // The function returns an non-nil error if some of the options carried invalid
   324  // configuration values.
   325  func NewRowGroupConfig(options ...RowGroupOption) (*RowGroupConfig, error) {
   326  	config := DefaultRowGroupConfig()
   327  	config.Apply(options...)
   328  	return config, config.Validate()
   329  }
   330  
   331  // Validate returns a non-nil error if the configuration of c is invalid.
   332  func (c *RowGroupConfig) Validate() error {
   333  	const baseName = "parquet.(*RowGroupConfig)."
   334  	return errorInvalidConfiguration(
   335  		validatePositiveInt(baseName+"ColumnBufferCapacity", c.ColumnBufferCapacity),
   336  		c.Sorting.Validate(),
   337  	)
   338  }
   339  
   340  func (c *RowGroupConfig) Apply(options ...RowGroupOption) {
   341  	for _, opt := range options {
   342  		opt.ConfigureRowGroup(c)
   343  	}
   344  }
   345  
   346  func (c *RowGroupConfig) ConfigureRowGroup(config *RowGroupConfig) {
   347  	*config = RowGroupConfig{
   348  		ColumnBufferCapacity: coalesceInt(c.ColumnBufferCapacity, config.ColumnBufferCapacity),
   349  		Schema:               coalesceSchema(c.Schema, config.Schema),
   350  		Sorting:              coalesceSortingConfig(c.Sorting, config.Sorting),
   351  	}
   352  }
   353  
// The SortingConfig type carries configuration options for sorting parquet
// rows; it is embedded in both WriterConfig and RowGroupConfig.
//
// SortingConfig implements the SortingOption interface so it can be used
// directly as argument to the SortingWriterConfig and SortingRowGroupConfig
// options when needed. Sorting options are passed to a sorting writer as
// follows:
//
//	buffer := parquet.NewSortingWriter[Row](
//		parquet.SortingWriterConfig(
//			parquet.DropDuplicatedRows(true),
//		),
//	)
type SortingConfig struct {
	SortingBuffers     BufferPool      // pool of buffers holding intermediary state when sorting rows; must not be nil
	SortingColumns     []SortingColumn // sorting order of columns; nil when the option was not passed
	DropDuplicatedRows bool            // whether rows with equal sorting columns are removed
}
   370  
   371  // DefaultSortingConfig returns a new SortingConfig value initialized with the
   372  // default row group configuration.
   373  func DefaultSortingConfig() *SortingConfig {
   374  	return &SortingConfig{
   375  		SortingBuffers: &defaultSortingBufferPool,
   376  	}
   377  }
   378  
   379  // NewSortingConfig constructs a new sorting configuration applying the
   380  // options passed as arguments.
   381  //
   382  // The function returns an non-nil error if some of the options carried invalid
   383  // configuration values.
   384  func NewSortingConfig(options ...SortingOption) (*SortingConfig, error) {
   385  	config := DefaultSortingConfig()
   386  	config.Apply(options...)
   387  	return config, config.Validate()
   388  }
   389  
   390  func (c *SortingConfig) Validate() error {
   391  	const baseName = "parquet.(*SortingConfig)."
   392  	return errorInvalidConfiguration(
   393  		validateNotNil(baseName+"SortingBuffers", c.SortingBuffers),
   394  	)
   395  }
   396  
   397  func (c *SortingConfig) Apply(options ...SortingOption) {
   398  	for _, opt := range options {
   399  		opt.ConfigureSorting(c)
   400  	}
   401  }
   402  
   403  func (c *SortingConfig) ConfigureSorting(config *SortingConfig) {
   404  	*config = coalesceSortingConfig(*c, *config)
   405  }
   406  
   407  // FileOption is an interface implemented by types that carry configuration
   408  // options for parquet files.
   409  type FileOption interface {
   410  	ConfigureFile(*FileConfig)
   411  }
   412  
   413  // ReaderOption is an interface implemented by types that carry configuration
   414  // options for parquet readers.
   415  type ReaderOption interface {
   416  	ConfigureReader(*ReaderConfig)
   417  }
   418  
   419  // WriterOption is an interface implemented by types that carry configuration
   420  // options for parquet writers.
   421  type WriterOption interface {
   422  	ConfigureWriter(*WriterConfig)
   423  }
   424  
   425  // RowGroupOption is an interface implemented by types that carry configuration
   426  // options for parquet row groups.
   427  type RowGroupOption interface {
   428  	ConfigureRowGroup(*RowGroupConfig)
   429  }
   430  
   431  // SortingOption is an interface implemented by types that carry configuration
   432  // options for parquet sorting writers.
   433  type SortingOption interface {
   434  	ConfigureSorting(*SortingConfig)
   435  }
   436  
   437  // SkipPageIndex is a file configuration option which prevents automatically
   438  // reading the page index when opening a parquet file, when set to true. This is
   439  // useful as an optimization when programs know that they will not need to
   440  // consume the page index.
   441  //
   442  // Defaults to false.
   443  func SkipPageIndex(skip bool) FileOption {
   444  	return fileOption(func(config *FileConfig) { config.SkipPageIndex = skip })
   445  }
   446  
   447  // SkipBloomFilters is a file configuration option which prevents automatically
   448  // reading the bloom filters when opening a parquet file, when set to true.
   449  // This is useful as an optimization when programs know that they will not need
   450  // to consume the bloom filters.
   451  //
   452  // Defaults to false.
   453  func SkipBloomFilters(skip bool) FileOption {
   454  	return fileOption(func(config *FileConfig) { config.SkipBloomFilters = skip })
   455  }
   456  
   457  // FileReadMode is a file configuration option which controls the way pages
   458  // are read. Currently the only two options are ReadModeAsync and ReadModeSync
   459  // which control whether or not pages are loaded asynchronously. It can be
   460  // advantageous to use ReadModeAsync if your reader is backed by network
   461  // storage.
   462  //
   463  // Defaults to ReadModeSync.
   464  func FileReadMode(mode ReadMode) FileOption {
   465  	return fileOption(func(config *FileConfig) { config.ReadMode = mode })
   466  }
   467  
   468  // ReadBufferSize is a file configuration option which controls the default
   469  // buffer sizes for reads made to the provided io.Reader. The default of 4096
   470  // is appropriate for disk based access but if your reader is backed by network
   471  // storage it can be advantageous to increase this value to something more like
   472  // 4 MiB.
   473  //
   474  // Defaults to 4096.
   475  func ReadBufferSize(size int) FileOption {
   476  	return fileOption(func(config *FileConfig) { config.ReadBufferSize = size })
   477  }
   478  
   479  // FileSchema is used to pass a known schema in while opening a Parquet file.
   480  // This optimization is only useful if your application is currently opening
   481  // an extremely large number of parquet files with the same, known schema.
   482  //
   483  // Defaults to nil.
   484  func FileSchema(schema *Schema) FileOption {
   485  	return fileOption(func(config *FileConfig) { config.Schema = schema })
   486  }
   487  
   488  // PageBufferSize configures the size of column page buffers on parquet writers.
   489  //
   490  // Note that the page buffer size refers to the in-memory buffers where pages
   491  // are generated, not the size of pages after encoding and compression.
   492  // This design choice was made to help control the amount of memory needed to
   493  // read and write pages rather than controlling the space used by the encoded
   494  // representation on disk.
   495  //
   496  // Defaults to 256KiB.
   497  func PageBufferSize(size int) WriterOption {
   498  	return writerOption(func(config *WriterConfig) { config.PageBufferSize = size })
   499  }
   500  
   501  // WriteBufferSize configures the size of the write buffer.
   502  //
   503  // Setting the writer buffer size to zero deactivates buffering, all writes are
   504  // immediately sent to the output io.Writer.
   505  //
   506  // Defaults to 32KiB.
   507  func WriteBufferSize(size int) WriterOption {
   508  	return writerOption(func(config *WriterConfig) { config.WriteBufferSize = size })
   509  }
   510  
   511  // MaxRowsPerRowGroup configures the maximum number of rows that a writer will
   512  // produce in each row group.
   513  //
   514  // This limit is useful to control size of row groups in both number of rows and
   515  // byte size. While controlling the byte size of a row group is difficult to
   516  // achieve with parquet due to column encoding and compression, the number of
   517  // rows remains a useful proxy.
   518  //
   519  // Defaults to unlimited.
   520  func MaxRowsPerRowGroup(numRows int64) WriterOption {
   521  	if numRows <= 0 {
   522  		numRows = DefaultMaxRowsPerRowGroup
   523  	}
   524  	return writerOption(func(config *WriterConfig) { config.MaxRowsPerRowGroup = numRows })
   525  }
   526  
   527  // CreatedBy creates a configuration option which sets the name of the
   528  // application that created a parquet file.
   529  //
   530  // The option formats the "CreatedBy" file metadata according to the convention
   531  // described by the parquet spec:
   532  //
   533  //	"<application> version <version> (build <build>)"
   534  //
   535  // By default, the option is set to the parquet-go module name, version, and
   536  // build hash.
   537  func CreatedBy(application, version, build string) WriterOption {
   538  	createdBy := formatCreatedBy(application, version, build)
   539  	return writerOption(func(config *WriterConfig) { config.CreatedBy = createdBy })
   540  }
   541  
   542  // ColumnPageBuffers creates a configuration option to customize the buffer pool
   543  // used when constructing row groups. This can be used to provide on-disk buffers
   544  // as swap space to ensure that the parquet file creation will no be bottlenecked
   545  // on the amount of memory available.
   546  //
   547  // Defaults to using in-memory buffers.
   548  func ColumnPageBuffers(buffers BufferPool) WriterOption {
   549  	return writerOption(func(config *WriterConfig) { config.ColumnPageBuffers = buffers })
   550  }
   551  
   552  // ColumnIndexSizeLimit creates a configuration option to customize the size
   553  // limit of page boundaries recorded in column indexes.
   554  //
   555  // Defaults to 16.
   556  func ColumnIndexSizeLimit(sizeLimit int) WriterOption {
   557  	return writerOption(func(config *WriterConfig) { config.ColumnIndexSizeLimit = sizeLimit })
   558  }
   559  
   560  // DataPageVersion creates a configuration option which configures the version of
   561  // data pages used when creating a parquet file.
   562  //
   563  // Defaults to version 2.
   564  func DataPageVersion(version int) WriterOption {
   565  	return writerOption(func(config *WriterConfig) { config.DataPageVersion = version })
   566  }
   567  
   568  // DataPageStatistics creates a configuration option which defines whether data
   569  // page statistics are emitted. This option is useful when generating parquet
   570  // files that intend to be backward compatible with older readers which may not
   571  // have the ability to load page statistics from the column index.
   572  //
   573  // Defaults to false.
   574  func DataPageStatistics(enabled bool) WriterOption {
   575  	return writerOption(func(config *WriterConfig) { config.DataPageStatistics = enabled })
   576  }
   577  
   578  // KeyValueMetadata creates a configuration option which adds key/value metadata
   579  // to add to the metadata of parquet files.
   580  //
   581  // This option is additive, it may be used multiple times to add more than one
   582  // key/value pair.
   583  //
   584  // Keys are assumed to be unique, if the same key is repeated multiple times the
   585  // last value is retained. While the parquet format does not require unique keys,
   586  // this design decision was made to optimize for the most common use case where
   587  // applications leverage this extension mechanism to associate single values to
   588  // keys. This may create incompatibilities with other parquet libraries, or may
   589  // cause some key/value pairs to be lost when open parquet files written with
   590  // repeated keys. We can revisit this decision if it ever becomes a blocker.
   591  func KeyValueMetadata(key, value string) WriterOption {
   592  	return writerOption(func(config *WriterConfig) {
   593  		if config.KeyValueMetadata == nil {
   594  			config.KeyValueMetadata = map[string]string{key: value}
   595  		} else {
   596  			config.KeyValueMetadata[key] = value
   597  		}
   598  	})
   599  }
   600  
   601  // BloomFilters creates a configuration option which defines the bloom filters
   602  // that parquet writers should generate.
   603  //
   604  // The compute and memory footprint of generating bloom filters for all columns
   605  // of a parquet schema can be significant, so by default no filters are created
   606  // and applications need to explicitly declare the columns that they want to
   607  // create filters for.
   608  func BloomFilters(filters ...BloomFilterColumn) WriterOption {
   609  	filters = append([]BloomFilterColumn{}, filters...)
   610  	return writerOption(func(config *WriterConfig) { config.BloomFilters = filters })
   611  }
   612  
   613  // Compression creates a configuration option which sets the default compression
   614  // codec used by a writer for columns where none were defined.
   615  func Compression(codec compress.Codec) WriterOption {
   616  	return writerOption(func(config *WriterConfig) { config.Compression = codec })
   617  }
   618  
   619  // SortingWriterConfig is a writer option which applies configuration specific
   620  // to sorting writers.
   621  func SortingWriterConfig(options ...SortingOption) WriterOption {
   622  	options = append([]SortingOption{}, options...)
   623  	return writerOption(func(config *WriterConfig) { config.Sorting.Apply(options...) })
   624  }
   625  
   626  // ColumnBufferCapacity creates a configuration option which defines the size of
   627  // row group column buffers.
   628  //
   629  // Defaults to 16384.
   630  func ColumnBufferCapacity(size int) RowGroupOption {
   631  	return rowGroupOption(func(config *RowGroupConfig) { config.ColumnBufferCapacity = size })
   632  }
   633  
   634  // SortingRowGroupConfig is a row group option which applies configuration
   635  // specific sorting row groups.
   636  func SortingRowGroupConfig(options ...SortingOption) RowGroupOption {
   637  	options = append([]SortingOption{}, options...)
   638  	return rowGroupOption(func(config *RowGroupConfig) { config.Sorting.Apply(options...) })
   639  }
   640  
   641  // SortingColumns creates a configuration option which defines the sorting order
   642  // of columns in a row group.
   643  //
   644  // The order of sorting columns passed as argument defines the ordering
   645  // hierarchy; when elements are equal in the first column, the second column is
   646  // used to order rows, etc...
   647  func SortingColumns(columns ...SortingColumn) SortingOption {
   648  	// Make a copy so that we do not retain the input slice generated implicitly
   649  	// for the variable argument list, and also avoid having a nil slice when
   650  	// the option is passed with no sorting columns, so we can differentiate it
   651  	// from it not being passed.
   652  	columns = append([]SortingColumn{}, columns...)
   653  	return sortingOption(func(config *SortingConfig) { config.SortingColumns = columns })
   654  }
   655  
   656  // SortingBuffers creates a configuration option which sets the pool of buffers
   657  // used to hold intermediary state when sorting parquet rows.
   658  //
   659  // Defaults to using in-memory buffers.
   660  func SortingBuffers(buffers BufferPool) SortingOption {
   661  	return sortingOption(func(config *SortingConfig) { config.SortingBuffers = buffers })
   662  }
   663  
   664  // DropDuplicatedRows configures whether a sorting writer will keep or remove
   665  // duplicated rows.
   666  //
   667  // Two rows are considered duplicates if the values of their all their sorting
   668  // columns are equal.
   669  //
   670  // Defaults to false
   671  func DropDuplicatedRows(drop bool) SortingOption {
   672  	return sortingOption(func(config *SortingConfig) { config.DropDuplicatedRows = drop })
   673  }
   674  
   675  type fileOption func(*FileConfig)
   676  
   677  func (opt fileOption) ConfigureFile(config *FileConfig) { opt(config) }
   678  
   679  type readerOption func(*ReaderConfig)
   680  
   681  func (opt readerOption) ConfigureReader(config *ReaderConfig) { opt(config) }
   682  
   683  type writerOption func(*WriterConfig)
   684  
   685  func (opt writerOption) ConfigureWriter(config *WriterConfig) { opt(config) }
   686  
   687  type rowGroupOption func(*RowGroupConfig)
   688  
   689  func (opt rowGroupOption) ConfigureRowGroup(config *RowGroupConfig) { opt(config) }
   690  
   691  type sortingOption func(*SortingConfig)
   692  
   693  func (opt sortingOption) ConfigureSorting(config *SortingConfig) { opt(config) }
   694  
   695  func coalesceInt(i1, i2 int) int {
   696  	if i1 != 0 {
   697  		return i1
   698  	}
   699  	return i2
   700  }
   701  
   702  func coalesceInt64(i1, i2 int64) int64 {
   703  	if i1 != 0 {
   704  		return i1
   705  	}
   706  	return i2
   707  }
   708  
   709  func coalesceString(s1, s2 string) string {
   710  	if s1 != "" {
   711  		return s1
   712  	}
   713  	return s2
   714  }
   715  
   716  func coalesceBytes(b1, b2 []byte) []byte {
   717  	if b1 != nil {
   718  		return b1
   719  	}
   720  	return b2
   721  }
   722  
   723  func coalesceBufferPool(p1, p2 BufferPool) BufferPool {
   724  	if p1 != nil {
   725  		return p1
   726  	}
   727  	return p2
   728  }
   729  
   730  func coalesceSchema(s1, s2 *Schema) *Schema {
   731  	if s1 != nil {
   732  		return s1
   733  	}
   734  	return s2
   735  }
   736  
   737  func coalesceSortingColumns(s1, s2 []SortingColumn) []SortingColumn {
   738  	if s1 != nil {
   739  		return s1
   740  	}
   741  	return s2
   742  }
   743  
   744  func coalesceSortingConfig(c1, c2 SortingConfig) SortingConfig {
   745  	return SortingConfig{
   746  		SortingBuffers:     coalesceBufferPool(c1.SortingBuffers, c2.SortingBuffers),
   747  		SortingColumns:     coalesceSortingColumns(c1.SortingColumns, c2.SortingColumns),
   748  		DropDuplicatedRows: c1.DropDuplicatedRows,
   749  	}
   750  }
   751  
   752  func coalesceBloomFilters(f1, f2 []BloomFilterColumn) []BloomFilterColumn {
   753  	if f1 != nil {
   754  		return f1
   755  	}
   756  	return f2
   757  }
   758  
   759  func coalesceCompression(c1, c2 compress.Codec) compress.Codec {
   760  	if c1 != nil {
   761  		return c1
   762  	}
   763  	return c2
   764  }
   765  
   766  func validatePositiveInt(optionName string, optionValue int) error {
   767  	if optionValue > 0 {
   768  		return nil
   769  	}
   770  	return errorInvalidOptionValue(optionName, optionValue)
   771  }
   772  
   773  func validatePositiveInt64(optionName string, optionValue int64) error {
   774  	if optionValue > 0 {
   775  		return nil
   776  	}
   777  	return errorInvalidOptionValue(optionName, optionValue)
   778  }
   779  
   780  func validateOneOfInt(optionName string, optionValue int, supportedValues ...int) error {
   781  	for _, value := range supportedValues {
   782  		if value == optionValue {
   783  			return nil
   784  		}
   785  	}
   786  	return errorInvalidOptionValue(optionName, optionValue)
   787  }
   788  
   789  func validateNotNil(optionName string, optionValue interface{}) error {
   790  	if optionValue != nil {
   791  		return nil
   792  	}
   793  	return errorInvalidOptionValue(optionName, optionValue)
   794  }
   795  
   796  func errorInvalidOptionValue(optionName string, optionValue interface{}) error {
   797  	return fmt.Errorf("invalid option value: %s: %v", optionName, optionValue)
   798  }
   799  
   800  func errorInvalidConfiguration(reasons ...error) error {
   801  	var err *invalidConfiguration
   802  
   803  	for _, reason := range reasons {
   804  		if reason != nil {
   805  			if err == nil {
   806  				err = new(invalidConfiguration)
   807  			}
   808  			err.reasons = append(err.reasons, reason)
   809  		}
   810  	}
   811  
   812  	if err != nil {
   813  		return err
   814  	}
   815  
   816  	return nil
   817  }
   818  
   819  type invalidConfiguration struct {
   820  	reasons []error
   821  }
   822  
   823  func (err *invalidConfiguration) Error() string {
   824  	errorMessage := new(strings.Builder)
   825  	for _, reason := range err.reasons {
   826  		errorMessage.WriteString(reason.Error())
   827  		errorMessage.WriteString("\n")
   828  	}
   829  	errorString := errorMessage.String()
   830  	if errorString != "" {
   831  		errorString = errorString[:len(errorString)-1]
   832  	}
   833  	return errorString
   834  }
   835  
// Compile-time checks that each configuration type satisfies the option
// interface of its own kind, so a configuration value can be passed wherever
// an option is accepted.
var (
	_ FileOption     = (*FileConfig)(nil)
	_ ReaderOption   = (*ReaderConfig)(nil)
	_ WriterOption   = (*WriterConfig)(nil)
	_ RowGroupOption = (*RowGroupConfig)(nil)
	_ SortingOption  = (*SortingConfig)(nil)
)