github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/config.go (about)

     1  package parquet
     2  
     3  import (
     4  	"fmt"
     5  	"strings"
     6  
     7  	"github.com/vc42/parquet-go/compress"
     8  )
     9  
    10  const (
    11  	DefaultCreatedBy            = "github.com/vc42/parquet-go"
    12  	DefaultColumnIndexSizeLimit = 16
    13  	DefaultColumnBufferCapacity = 16 * 1024
    14  	DefaultPageBufferSize       = 256 * 1024
    15  	DefaultWriteBufferSize      = 32 * 1024
    16  	DefaultDataPageVersion      = 2
    17  	DefaultDataPageStatistics   = false
    18  	DefaultSkipPageIndex        = false
    19  	DefaultSkipBloomFilters     = false
    20  )
    21  
// The FileConfig type carries configuration options for parquet files.
//
// FileConfig implements the FileOption interface so it can be used directly
// as argument to the OpenFile function when needed, for example:
//
//	f, err := parquet.OpenFile(reader, size, &parquet.FileConfig{
//		SkipPageIndex:    true,
//		SkipBloomFilters: true,
//	})
type FileConfig struct {
	// SkipPageIndex, when true, prevents automatically reading the page index
	// when opening a parquet file. Defaults to DefaultSkipPageIndex.
	SkipPageIndex bool
	// SkipBloomFilters, when true, prevents automatically reading the bloom
	// filters when opening a parquet file. Defaults to DefaultSkipBloomFilters.
	SkipBloomFilters bool
}
    36  
    37  // DefaultFileConfig returns a new FileConfig value initialized with the
    38  // default file configuration.
    39  func DefaultFileConfig() *FileConfig {
    40  	return &FileConfig{
    41  		SkipPageIndex:    DefaultSkipPageIndex,
    42  		SkipBloomFilters: DefaultSkipBloomFilters,
    43  	}
    44  }
    45  
    46  // NewFileConfig constructs a new file configuration applying the options passed
    47  // as arguments.
    48  //
    49  // The function returns an non-nil error if some of the options carried invalid
    50  // configuration values.
    51  func NewFileConfig(options ...FileOption) (*FileConfig, error) {
    52  	config := DefaultFileConfig()
    53  	config.Apply(options...)
    54  	return config, config.Validate()
    55  }
    56  
    57  // Apply applies the given list of options to c.
    58  func (c *FileConfig) Apply(options ...FileOption) {
    59  	for _, opt := range options {
    60  		opt.ConfigureFile(c)
    61  	}
    62  }
    63  
    64  // ConfigureFile applies configuration options from c to config.
    65  func (c *FileConfig) ConfigureFile(config *FileConfig) {
    66  	*config = FileConfig{
    67  		SkipPageIndex:    config.SkipPageIndex,
    68  		SkipBloomFilters: config.SkipBloomFilters,
    69  	}
    70  }
    71  
// Validate returns a non-nil error if the configuration of c is invalid.
//
// FileConfig currently has no constraints to check, so the method always
// returns nil.
func (c *FileConfig) Validate() error {
	return nil
}
    76  
// The ReaderConfig type carries configuration options for parquet readers.
//
// ReaderConfig implements the ReaderOption interface so it can be used directly
// as argument to the NewReader function when needed, for example:
//
//	reader := parquet.NewReader(output, schema, &parquet.ReaderConfig{
//		// ...
//	})
type ReaderConfig struct {
	// Schema is the schema carried by the configuration. It is nil in the
	// default configuration; when a ReaderConfig is used as an option, a nil
	// Schema leaves the target's schema untouched.
	Schema *Schema
}
    89  
    90  // DefaultReaderConfig returns a new ReaderConfig value initialized with the
    91  // default reader configuration.
    92  func DefaultReaderConfig() *ReaderConfig {
    93  	return &ReaderConfig{}
    94  }
    95  
    96  // NewReaderConfig constructs a new reader configuration applying the options
    97  // passed as arguments.
    98  //
    99  // The function returns an non-nil error if some of the options carried invalid
   100  // configuration values.
   101  func NewReaderConfig(options ...ReaderOption) (*ReaderConfig, error) {
   102  	config := DefaultReaderConfig()
   103  	config.Apply(options...)
   104  	return config, config.Validate()
   105  }
   106  
   107  // Apply applies the given list of options to c.
   108  func (c *ReaderConfig) Apply(options ...ReaderOption) {
   109  	for _, opt := range options {
   110  		opt.ConfigureReader(c)
   111  	}
   112  }
   113  
   114  // ConfigureReader applies configuration options from c to config.
   115  func (c *ReaderConfig) ConfigureReader(config *ReaderConfig) {
   116  	*config = ReaderConfig{
   117  		Schema: coalesceSchema(c.Schema, config.Schema),
   118  	}
   119  }
   120  
// Validate returns a non-nil error if the configuration of c is invalid.
//
// ReaderConfig currently has no constraints to check, so the method always
// returns nil.
func (c *ReaderConfig) Validate() error {
	return nil
}
   125  
// The WriterConfig type carries configuration options for parquet writers.
//
// WriterConfig implements the WriterOption interface so it can be used directly
// as argument to the NewWriter function when needed, for example:
//
//	writer := parquet.NewWriter(output, schema, &parquet.WriterConfig{
//		CreatedBy: "my test program",
//	})
type WriterConfig struct {
	// CreatedBy is the name of the application that created the parquet file.
	CreatedBy string
	// ColumnPageBuffers is the buffer pool used when constructing row groups.
	// Validate rejects a nil pool.
	ColumnPageBuffers PageBufferPool
	// ColumnIndexSizeLimit is the size limit of page boundaries recorded in
	// column indexes. Validate requires a positive value.
	ColumnIndexSizeLimit int
	// PageBufferSize is the size of the in-memory column page buffers.
	// Validate requires a positive value.
	PageBufferSize int
	// WriteBufferSize is the size of the write buffer.
	WriteBufferSize int
	// DataPageVersion is the version of data pages used when creating a
	// parquet file. Validate accepts 1 or 2.
	DataPageVersion int
	// DataPageStatistics defines whether data page statistics are emitted.
	DataPageStatistics bool
	// KeyValueMetadata holds the key/value pairs added to the metadata of
	// parquet files.
	KeyValueMetadata map[string]string
	// Schema is the schema carried by the configuration; nil in the default
	// configuration.
	Schema *Schema
	// SortingColumns defines the sorting order of columns.
	SortingColumns []SortingColumn
	// BloomFilters lists the bloom filters that the writer should generate.
	BloomFilters []BloomFilterColumn
	// Compression is the default compression codec used for columns where
	// none were defined.
	Compression compress.Codec
}
   149  
   150  // DefaultWriterConfig returns a new WriterConfig value initialized with the
   151  // default writer configuration.
   152  func DefaultWriterConfig() *WriterConfig {
   153  	return &WriterConfig{
   154  		CreatedBy:            DefaultCreatedBy,
   155  		ColumnPageBuffers:    &defaultPageBufferPool,
   156  		ColumnIndexSizeLimit: DefaultColumnIndexSizeLimit,
   157  		PageBufferSize:       DefaultPageBufferSize,
   158  		WriteBufferSize:      DefaultWriteBufferSize,
   159  		DataPageVersion:      DefaultDataPageVersion,
   160  		DataPageStatistics:   DefaultDataPageStatistics,
   161  	}
   162  }
   163  
   164  // NewWriterConfig constructs a new writer configuration applying the options
   165  // passed as arguments.
   166  //
   167  // The function returns an non-nil error if some of the options carried invalid
   168  // configuration values.
   169  func NewWriterConfig(options ...WriterOption) (*WriterConfig, error) {
   170  	config := DefaultWriterConfig()
   171  	config.Apply(options...)
   172  	return config, config.Validate()
   173  }
   174  
   175  // Apply applies the given list of options to c.
   176  func (c *WriterConfig) Apply(options ...WriterOption) {
   177  	for _, opt := range options {
   178  		opt.ConfigureWriter(c)
   179  	}
   180  }
   181  
   182  // ConfigureWriter applies configuration options from c to config.
   183  func (c *WriterConfig) ConfigureWriter(config *WriterConfig) {
   184  	keyValueMetadata := config.KeyValueMetadata
   185  	if len(c.KeyValueMetadata) > 0 {
   186  		if keyValueMetadata == nil {
   187  			keyValueMetadata = make(map[string]string, len(c.KeyValueMetadata))
   188  		}
   189  		for k, v := range c.KeyValueMetadata {
   190  			keyValueMetadata[k] = v
   191  		}
   192  	}
   193  	*config = WriterConfig{
   194  		CreatedBy:            coalesceString(c.CreatedBy, config.CreatedBy),
   195  		ColumnPageBuffers:    coalescePageBufferPool(c.ColumnPageBuffers, config.ColumnPageBuffers),
   196  		ColumnIndexSizeLimit: coalesceInt(c.ColumnIndexSizeLimit, config.ColumnIndexSizeLimit),
   197  		PageBufferSize:       coalesceInt(c.PageBufferSize, config.PageBufferSize),
   198  		WriteBufferSize:      coalesceInt(c.WriteBufferSize, config.WriteBufferSize),
   199  		DataPageVersion:      coalesceInt(c.DataPageVersion, config.DataPageVersion),
   200  		DataPageStatistics:   config.DataPageStatistics,
   201  		KeyValueMetadata:     keyValueMetadata,
   202  		Schema:               coalesceSchema(c.Schema, config.Schema),
   203  		SortingColumns:       coalesceSortingColumns(c.SortingColumns, config.SortingColumns),
   204  		BloomFilters:         coalesceBloomFilters(c.BloomFilters, config.BloomFilters),
   205  		Compression:          coalesceCompression(c.Compression, config.Compression),
   206  	}
   207  }
   208  
   209  // Validate returns a non-nil error if the configuration of c is invalid.
   210  func (c *WriterConfig) Validate() error {
   211  	const baseName = "parquet.(*WriterConfig)."
   212  	return errorInvalidConfiguration(
   213  		validateNotNil(baseName+"ColumnPageBuffers", c.ColumnPageBuffers),
   214  		validatePositiveInt(baseName+"ColumnIndexSizeLimit", c.ColumnIndexSizeLimit),
   215  		validatePositiveInt(baseName+"PageBufferSize", c.PageBufferSize),
   216  		validateOneOfInt(baseName+"DataPageVersion", c.DataPageVersion, 1, 2),
   217  	)
   218  }
   219  
// The RowGroupConfig type carries configuration options for parquet row groups.
//
// RowGroupConfig implements the RowGroupOption interface so it can be used
// directly as argument to the NewBuffer function when needed, for example:
//
//	buffer := parquet.NewBuffer(&parquet.RowGroupConfig{
//		ColumnBufferCapacity: 10_000,
//	})
type RowGroupConfig struct {
	// ColumnBufferCapacity is the size of row group column buffers. Validate
	// requires a positive value.
	ColumnBufferCapacity int
	// SortingColumns defines the sorting order of columns in the row group.
	SortingColumns []SortingColumn
	// Schema is the schema carried by the configuration; nil in the default
	// configuration.
	Schema *Schema
}
   234  
   235  // DefaultRowGroupConfig returns a new RowGroupConfig value initialized with the
   236  // default row group configuration.
   237  func DefaultRowGroupConfig() *RowGroupConfig {
   238  	return &RowGroupConfig{
   239  		ColumnBufferCapacity: DefaultColumnBufferCapacity,
   240  	}
   241  }
   242  
   243  // NewRowGroupConfig constructs a new row group configuration applying the
   244  // options passed as arguments.
   245  //
   246  // The function returns an non-nil error if some of the options carried invalid
   247  // configuration values.
   248  func NewRowGroupConfig(options ...RowGroupOption) (*RowGroupConfig, error) {
   249  	config := DefaultRowGroupConfig()
   250  	config.Apply(options...)
   251  	return config, config.Validate()
   252  }
   253  
   254  // Validate returns a non-nil error if the configuration of c is invalid.
   255  func (c *RowGroupConfig) Validate() error {
   256  	const baseName = "parquet.(*RowGroupConfig)."
   257  	return errorInvalidConfiguration(
   258  		validatePositiveInt(baseName+"ColumnBufferCapacity", c.ColumnBufferCapacity),
   259  	)
   260  }
   261  
   262  func (c *RowGroupConfig) Apply(options ...RowGroupOption) {
   263  	for _, opt := range options {
   264  		opt.ConfigureRowGroup(c)
   265  	}
   266  }
   267  
   268  func (c *RowGroupConfig) ConfigureRowGroup(config *RowGroupConfig) {
   269  	*config = RowGroupConfig{
   270  		ColumnBufferCapacity: coalesceInt(c.ColumnBufferCapacity, config.ColumnBufferCapacity),
   271  		SortingColumns:       coalesceSortingColumns(c.SortingColumns, config.SortingColumns),
   272  		Schema:               coalesceSchema(c.Schema, config.Schema),
   273  	}
   274  }
   275  
// FileOption is an interface implemented by types that carry configuration
// options for parquet files.
type FileOption interface {
	// ConfigureFile applies the option to the given configuration, modifying
	// it in place.
	ConfigureFile(*FileConfig)
}

// ReaderOption is an interface implemented by types that carry configuration
// options for parquet readers.
type ReaderOption interface {
	// ConfigureReader applies the option to the given configuration,
	// modifying it in place.
	ConfigureReader(*ReaderConfig)
}

// WriterOption is an interface implemented by types that carry configuration
// options for parquet writers.
type WriterOption interface {
	// ConfigureWriter applies the option to the given configuration,
	// modifying it in place.
	ConfigureWriter(*WriterConfig)
}

// RowGroupOption is an interface implemented by types that carry configuration
// options for parquet row groups.
type RowGroupOption interface {
	// ConfigureRowGroup applies the option to the given configuration,
	// modifying it in place.
	ConfigureRowGroup(*RowGroupConfig)
}
   299  
   300  // SkipPageIndex is a file configuration option which prevents automatically
   301  // reading the page index when opening a parquet file, when set to true. This is
   302  // useful as an optimization when programs know that they will not need to
   303  // consume the page index.
   304  //
   305  // Defaults to false.
   306  func SkipPageIndex(skip bool) FileOption {
   307  	return fileOption(func(config *FileConfig) { config.SkipPageIndex = skip })
   308  }
   309  
   310  // SkipBloomFilters is a file configuration option which prevents automatically
   311  // reading the bloom filters when opening a parquet file, when set to true.
   312  // This is useful as an optimization when programs know that they will not need
   313  // to consume the bloom filters.
   314  //
   315  // Defaults to false.
   316  func SkipBloomFilters(skip bool) FileOption {
   317  	return fileOption(func(config *FileConfig) { config.SkipBloomFilters = skip })
   318  }
   319  
   320  // PageBufferSize configures the size of column page buffers on parquet writers.
   321  //
   322  // Note that the page buffer size refers to the in-memory buffers where pages
   323  // are generated, not the size of pages after encoding and compression.
   324  // This design choice was made to help control the amount of memory needed to
   325  // read and write pages rather than controlling the space used by the encoded
   326  // representation on disk.
   327  //
   328  // Defaults to 256KiB.
   329  func PageBufferSize(size int) WriterOption {
   330  	return writerOption(func(config *WriterConfig) { config.PageBufferSize = size })
   331  }
   332  
   333  // WriteBufferSize configures the size of the write buffer.
   334  //
   335  // Setting the writer buffer size to zero deactivates buffering, all writes are
   336  // immediately sent to the output io.Writer.
   337  //
   338  // Defaults to 32KiB.
   339  func WriteBufferSize(size int) WriterOption {
   340  	return writerOption(func(config *WriterConfig) { config.WriteBufferSize = size })
   341  }
   342  
   343  // CreatedBy creates a configuration option which sets the name of the
   344  // application that created a parquet file.
   345  //
   346  // By default, this information is omitted.
   347  func CreatedBy(createdBy string) WriterOption {
   348  	return writerOption(func(config *WriterConfig) { config.CreatedBy = createdBy })
   349  }
   350  
   351  // ColumnPageBuffers creates a configuration option to customize the buffer pool
   352  // used when constructing row groups. This can be used to provide on-disk buffers
   353  // as swap space to ensure that the parquet file creation will no be bottlenecked
   354  // on the amount of memory available.
   355  //
   356  // Defaults to using in-memory buffers.
   357  func ColumnPageBuffers(buffers PageBufferPool) WriterOption {
   358  	return writerOption(func(config *WriterConfig) { config.ColumnPageBuffers = buffers })
   359  }
   360  
   361  // ColumnIndexSizeLimit creates a configuration option to customize the size
   362  // limit of page boundaries recorded in column indexes.
   363  //
   364  // Defaults to 16.
   365  func ColumnIndexSizeLimit(sizeLimit int) WriterOption {
   366  	return writerOption(func(config *WriterConfig) { config.ColumnIndexSizeLimit = sizeLimit })
   367  }
   368  
   369  // DataPageVersion creates a configuration option which configures the version of
   370  // data pages used when creating a parquet file.
   371  //
   372  // Defaults to version 2.
   373  func DataPageVersion(version int) WriterOption {
   374  	return writerOption(func(config *WriterConfig) { config.DataPageVersion = version })
   375  }
   376  
   377  // DataPageStatistics creates a configuration option which defines whether data
   378  // page statistics are emitted. This option is useful when generating parquet
   379  // files that intend to be backward compatible with older readers which may not
   380  // have the ability to load page statistics from the column index.
   381  //
   382  // Defaults to false.
   383  func DataPageStatistics(enabled bool) WriterOption {
   384  	return writerOption(func(config *WriterConfig) { config.DataPageStatistics = enabled })
   385  }
   386  
   387  // KeyValueMetadata creates a configuration option which adds key/value metadata
   388  // to add to the metadata of parquet files.
   389  //
   390  // This option is additive, it may be used multiple times to add more than one
   391  // key/value pair.
   392  //
   393  // Keys are assumed to be unique, if the same key is repeated multiple times the
   394  // last value is retained. While the parquet format does not require unique keys,
   395  // this design decision was made to optimize for the most common use case where
   396  // applications leverage this extension mechanism to associate single values to
   397  // keys. This may create incompatibilities with other parquet libraries, or may
   398  // cause some key/value pairs to be lost when open parquet files written with
   399  // repeated keys. We can revisit this decision if it ever becomes a blocker.
   400  func KeyValueMetadata(key, value string) WriterOption {
   401  	return writerOption(func(config *WriterConfig) {
   402  		if config.KeyValueMetadata == nil {
   403  			config.KeyValueMetadata = map[string]string{key: value}
   404  		} else {
   405  			config.KeyValueMetadata[key] = value
   406  		}
   407  	})
   408  }
   409  
   410  // BloomFilters creates a configuration option which defines the bloom filters
   411  // that parquet writers should generate.
   412  //
   413  // The compute and memory footprint of generating bloom filters for all columns
   414  // of a parquet schema can be significant, so by default no filters are created
   415  // and applications need to explicitly declare the columns that they want to
   416  // create filters for.
   417  func BloomFilters(filters ...BloomFilterColumn) WriterOption {
   418  	filters = append([]BloomFilterColumn{}, filters...)
   419  	return writerOption(func(config *WriterConfig) { config.BloomFilters = filters })
   420  }
   421  
   422  // Compression creates a configuration option which sets the default compression
   423  // codec used by a writer for columns where none were defined.
   424  func Compression(codec compress.Codec) WriterOption {
   425  	return writerOption(func(config *WriterConfig) { config.Compression = codec })
   426  }
   427  
   428  // ColumnBufferCapacity creates a configuration option which defines the size of
   429  // row group column buffers.
   430  //
   431  // Defaults to 16384.
   432  func ColumnBufferCapacity(size int) RowGroupOption {
   433  	return rowGroupOption(func(config *RowGroupConfig) { config.ColumnBufferCapacity = size })
   434  }
   435  
   436  // SortingColumns creates a configuration option which defines the sorting order
   437  // of columns in a row group.
   438  //
   439  // The order of sorting columns passed as argument defines the ordering
   440  // hierarchy; when elements are equal in the first column, the second column is
   441  // used to order rows, etc...
   442  func SortingColumns(columns ...SortingColumn) interface {
   443  	RowGroupOption
   444  	WriterOption
   445  } {
   446  	// Make a copy so that we do not retain the input slice generated implicitly
   447  	// for the variable argument list, and also avoid having a nil slice when
   448  	// the option is passed with no sorting columns, so we can differentiate it
   449  	// from it not being passed.
   450  	columns = append([]SortingColumn{}, columns...)
   451  	return sortingColumns(columns)
   452  }
   453  
   454  type sortingColumns []SortingColumn
   455  
   456  func (columns sortingColumns) ConfigureRowGroup(config *RowGroupConfig) {
   457  	config.SortingColumns = columns
   458  }
   459  
   460  func (columns sortingColumns) ConfigureWriter(config *WriterConfig) {
   461  	config.SortingColumns = columns
   462  }
   463  
   464  type fileOption func(*FileConfig)
   465  
   466  func (opt fileOption) ConfigureFile(config *FileConfig) { opt(config) }
   467  
   468  type readerOption func(*ReaderConfig)
   469  
   470  func (opt readerOption) ConfigureReader(config *ReaderConfig) { opt(config) }
   471  
   472  type writerOption func(*WriterConfig)
   473  
   474  func (opt writerOption) ConfigureWriter(config *WriterConfig) { opt(config) }
   475  
   476  type rowGroupOption func(*RowGroupConfig)
   477  
   478  func (opt rowGroupOption) ConfigureRowGroup(config *RowGroupConfig) { opt(config) }
   479  
   480  func coalesceInt(i1, i2 int) int {
   481  	if i1 != 0 {
   482  		return i1
   483  	}
   484  	return i2
   485  }
   486  
   487  func coalesceInt64(i1, i2 int64) int64 {
   488  	if i1 != 0 {
   489  		return i1
   490  	}
   491  	return i2
   492  }
   493  
   494  func coalesceString(s1, s2 string) string {
   495  	if s1 != "" {
   496  		return s1
   497  	}
   498  	return s2
   499  }
   500  
   501  func coalesceBytes(b1, b2 []byte) []byte {
   502  	if b1 != nil {
   503  		return b1
   504  	}
   505  	return b2
   506  }
   507  
   508  func coalescePageBufferPool(p1, p2 PageBufferPool) PageBufferPool {
   509  	if p1 != nil {
   510  		return p1
   511  	}
   512  	return p2
   513  }
   514  
   515  func coalesceSchema(s1, s2 *Schema) *Schema {
   516  	if s1 != nil {
   517  		return s1
   518  	}
   519  	return s2
   520  }
   521  
   522  func coalesceSortingColumns(s1, s2 []SortingColumn) []SortingColumn {
   523  	if s1 != nil {
   524  		return s1
   525  	}
   526  	return s2
   527  }
   528  
   529  func coalesceBloomFilters(f1, f2 []BloomFilterColumn) []BloomFilterColumn {
   530  	if f1 != nil {
   531  		return f1
   532  	}
   533  	return f2
   534  }
   535  
   536  func coalesceCompression(c1, c2 compress.Codec) compress.Codec {
   537  	if c1 != nil {
   538  		return c1
   539  	}
   540  	return c2
   541  }
   542  
   543  func validatePositiveInt(optionName string, optionValue int) error {
   544  	if optionValue > 0 {
   545  		return nil
   546  	}
   547  	return errorInvalidOptionValue(optionName, optionValue)
   548  }
   549  
   550  func validatePositiveInt64(optionName string, optionValue int64) error {
   551  	if optionValue > 0 {
   552  		return nil
   553  	}
   554  	return errorInvalidOptionValue(optionName, optionValue)
   555  }
   556  
   557  func validateOneOfInt(optionName string, optionValue int, supportedValues ...int) error {
   558  	for _, value := range supportedValues {
   559  		if value == optionValue {
   560  			return nil
   561  		}
   562  	}
   563  	return errorInvalidOptionValue(optionName, optionValue)
   564  }
   565  
   566  func validateNotNil(optionName string, optionValue interface{}) error {
   567  	if optionValue != nil {
   568  		return nil
   569  	}
   570  	return errorInvalidOptionValue(optionName, optionValue)
   571  }
   572  
   573  func errorInvalidOptionValue(optionName string, optionValue interface{}) error {
   574  	return fmt.Errorf("invalid option value: %s: %v", optionName, optionValue)
   575  }
   576  
   577  func errorInvalidConfiguration(reasons ...error) error {
   578  	var err *invalidConfiguration
   579  
   580  	for _, reason := range reasons {
   581  		if reason != nil {
   582  			if err == nil {
   583  				err = new(invalidConfiguration)
   584  			}
   585  			err.reasons = append(err.reasons, reason)
   586  		}
   587  	}
   588  
   589  	if err != nil {
   590  		return err
   591  	}
   592  
   593  	return nil
   594  }
   595  
   596  type invalidConfiguration struct {
   597  	reasons []error
   598  }
   599  
   600  func (err *invalidConfiguration) Error() string {
   601  	errorMessage := new(strings.Builder)
   602  	for _, reason := range err.reasons {
   603  		errorMessage.WriteString(reason.Error())
   604  		errorMessage.WriteString("\n")
   605  	}
   606  	errorString := errorMessage.String()
   607  	if errorString != "" {
   608  		errorString = errorString[:len(errorString)-1]
   609  	}
   610  	return errorString
   611  }
   612  
// Compile-time checks that the configuration types implement the option
// interfaces, so that they can be passed wherever an option is accepted.
var (
	_ FileOption     = (*FileConfig)(nil)
	_ ReaderOption   = (*ReaderConfig)(nil)
	_ WriterOption   = (*WriterConfig)(nil)
	_ RowGroupOption = (*RowGroupConfig)(nil)
)