github.com/apache/arrow/go/v10@v10.0.1/parquet/writer_properties.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package parquet
    18  
    19  import (
    20  	"github.com/apache/arrow/go/v10/arrow/memory"
    21  	"github.com/apache/arrow/go/v10/parquet/compress"
    22  )
    23  
// Constants for default property values used for the default reader, writer and column props.
const (
	// Default Buffer size used for the Reader (16 KiB).
	DefaultBufSize int64 = 4096 * 4
	// Default data page size limit is 1MB (1024 * 1024 bytes). It's not guaranteed,
	// but we will try to cut data pages off at this size where possible.
	DefaultDataPageSize int64 = 1024 * 1024
	// Default is for dictionary encoding to be turned on, use WithDictionaryDefault
	// writer property to change that.
	DefaultDictionaryEnabled = true
	// If the dictionary reaches the size of this limitation, the writer will use
	// the fallback encoding (usually plain) instead of continuing to build the
	// dictionary index. It defaults to the same value as DefaultDataPageSize.
	DefaultDictionaryPageSizeLimit = DefaultDataPageSize
	// In order to attempt to facilitate data page size limits for writing,
	// data is written in batches. Increasing the batch size may improve performance
	// but the larger the batch size, the easier it is to overshoot the datapage limit.
	DefaultWriteBatchSize int64 = 1024
	// Default maximum number of rows for a single row group (64 * 1024 * 1024 rows).
	DefaultMaxRowGroupLen int64 = 64 * 1024 * 1024
	// Default is to have stats enabled for all columns, use writer properties to
	// change the default, or to enable/disable for specific columns.
	DefaultStatsEnabled = true
	// If the stats are larger than 4K the writer will skip writing them out anyways.
	DefaultMaxStatsSize int64 = 4096
	DefaultCreatedBy          = "parquet-go version 10.0.1" // default "created by" string, override with WithCreatedBy
	DefaultRootName           = "schema"                    // default name of the root schema node, override with WithRootName
)
    52  
// ColumnProperties defines the encoding, codec, and so on for a given column.
//
// A value of this type is used both for the writer-wide defaults and for
// per-column overrides; see DefaultColumnProperties for the default values.
type ColumnProperties struct {
	Encoding          Encoding             // encoding used when dictionary encoding is not in use
	Codec             compress.Compression // compression codec for the column
	DictionaryEnabled bool                 // whether dictionary encoding is enabled
	StatsEnabled      bool                 // whether statistics are enabled
	MaxStatsSize      int64                // maximum size of the statistics before they are left out
	CompressionLevel  int                  // codec specific compression level
}
    62  
    63  // DefaultColumnProperties returns the default properties which get utilized for writing.
    64  //
    65  // The default column properties are the following constants:
    66  //	Encoding:						Encodings.Plain
    67  //	Codec:							compress.Codecs.Uncompressed
    68  //	DictionaryEnabled:	DefaultDictionaryEnabled
    69  //	StatsEnabled:				DefaultStatsEnabled
    70  //	MaxStatsSize:				DefaultMaxStatsSize
    71  //	CompressionLevel:		compress.DefaultCompressionLevel
    72  func DefaultColumnProperties() ColumnProperties {
    73  	return ColumnProperties{
    74  		Encoding:          Encodings.Plain,
    75  		Codec:             compress.Codecs.Uncompressed,
    76  		DictionaryEnabled: DefaultDictionaryEnabled,
    77  		StatsEnabled:      DefaultStatsEnabled,
    78  		MaxStatsSize:      DefaultMaxStatsSize,
    79  		CompressionLevel:  compress.DefaultCompressionLevel,
    80  	}
    81  }
    82  
// writerPropConfig is the scratch state that WriterProperty options mutate while
// NewWriterProperties runs. Writer-wide settings are written straight into wr,
// while the per-column settings are collected in the maps (keyed by column path
// string) and merged into wr once every option has been applied.
type writerPropConfig struct {
	wr            *WriterProperties               // properties instance being built
	encodings     map[string]Encoding             // per-column encodings
	codecs        map[string]compress.Compression // per-column compression codecs
	compressLevel map[string]int                  // per-column compression levels
	dictEnabled   map[string]bool                 // per-column dictionary encoding switches
	statsEnabled  map[string]bool                 // per-column statistics switches
}
    91  
// WriterProperty is used as the options for building a writer properties instance.
//
// Each option is a function which mutates the in-progress configuration; pass the
// options to NewWriterProperties to have them applied.
type WriterProperty func(*writerPropConfig)
    94  
    95  // WithAllocator specifies the writer to use the given allocator
    96  func WithAllocator(mem memory.Allocator) WriterProperty {
    97  	return func(cfg *writerPropConfig) {
    98  		cfg.wr.mem = mem
    99  	}
   100  }
   101  
   102  // WithDictionaryDefault sets the default value for whether to enable dictionary encoding
   103  func WithDictionaryDefault(dict bool) WriterProperty {
   104  	return func(cfg *writerPropConfig) {
   105  		cfg.wr.defColumnProps.DictionaryEnabled = dict
   106  	}
   107  }
   108  
   109  // WithDictionaryFor allows enabling or disabling dictionary encoding for a given column path string
   110  func WithDictionaryFor(path string, dict bool) WriterProperty {
   111  	return func(cfg *writerPropConfig) {
   112  		cfg.dictEnabled[path] = dict
   113  	}
   114  }
   115  
   116  // WithDictionaryPath is like WithDictionaryFor, but takes a ColumnPath type
   117  func WithDictionaryPath(path ColumnPath, dict bool) WriterProperty {
   118  	return WithDictionaryFor(path.String(), dict)
   119  }
   120  
   121  // WithDictionaryPageSizeLimit is the limit of the dictionary at which the writer
   122  // will fallback to plain encoding instead
   123  func WithDictionaryPageSizeLimit(limit int64) WriterProperty {
   124  	return func(cfg *writerPropConfig) {
   125  		cfg.wr.dictPagesize = limit
   126  	}
   127  }
   128  
   129  // WithBatchSize specifies the number of rows to use for batch writes to columns
   130  func WithBatchSize(batch int64) WriterProperty {
   131  	return func(cfg *writerPropConfig) {
   132  		cfg.wr.batchSize = batch
   133  	}
   134  }
   135  
   136  // WithMaxRowGroupLength specifies the number of rows as the maximum number of rows for a given row group in the writer.
   137  func WithMaxRowGroupLength(nrows int64) WriterProperty {
   138  	return func(cfg *writerPropConfig) {
   139  		cfg.wr.maxRowGroupLen = nrows
   140  	}
   141  }
   142  
   143  // WithDataPageSize specifies the size to use for splitting data pages for column writing.
   144  func WithDataPageSize(pgsize int64) WriterProperty {
   145  	return func(cfg *writerPropConfig) {
   146  		cfg.wr.pageSize = pgsize
   147  	}
   148  }
   149  
   150  // WithDataPageVersion specifies whether to use Version 1 or Version 2 of the DataPage spec
   151  func WithDataPageVersion(version DataPageVersion) WriterProperty {
   152  	return func(cfg *writerPropConfig) {
   153  		cfg.wr.dataPageVersion = version
   154  	}
   155  }
   156  
   157  // WithVersion specifies which Parquet Spec version to utilize for writing.
   158  func WithVersion(version Version) WriterProperty {
   159  	return func(cfg *writerPropConfig) {
   160  		cfg.wr.parquetVersion = version
   161  	}
   162  }
   163  
   164  // WithCreatedBy specifies the "created by" string to use for the writer
   165  func WithCreatedBy(createdby string) WriterProperty {
   166  	return func(cfg *writerPropConfig) {
   167  		cfg.wr.createdBy = createdby
   168  	}
   169  }
   170  
   171  // WithRootName enables customization of the name used for the root schema node. This is required
   172  // to maintain compatibility with other tools.
   173  func WithRootName(name string) WriterProperty {
   174  	return func(cfg *writerPropConfig) {
   175  		cfg.wr.rootName = name
   176  	}
   177  }
   178  
   179  // WithRootRepetition enables customization of the repetition used for the root schema node.
   180  // This is required to maintain compatibility with other tools.
   181  func WithRootRepetition(repetition Repetition) WriterProperty {
   182  	return func(cfg *writerPropConfig) {
   183  		cfg.wr.rootRepetition = repetition
   184  	}
   185  }
   186  
   187  // WithEncoding defines the encoding that is used when we aren't using dictionary encoding.
   188  //
   189  // This is either applied if dictionary encoding is disabled, or if we fallback if the dictionary
   190  // grew too large.
   191  func WithEncoding(encoding Encoding) WriterProperty {
   192  	return func(cfg *writerPropConfig) {
   193  		if encoding == Encodings.PlainDict || encoding == Encodings.RLEDict {
   194  			panic("parquet: can't use dictionary encoding as fallback encoding")
   195  		}
   196  		cfg.wr.defColumnProps.Encoding = encoding
   197  	}
   198  }
   199  
   200  // WithEncodingFor is for defining the encoding only for a specific column path. This encoding will be used
   201  // if dictionary encoding is disabled for the column or if we fallback because the dictionary grew too large
   202  func WithEncodingFor(path string, encoding Encoding) WriterProperty {
   203  	return func(cfg *writerPropConfig) {
   204  		if encoding == Encodings.PlainDict || encoding == Encodings.RLEDict {
   205  			panic("parquet: can't use dictionary encoding as fallback encoding")
   206  		}
   207  		cfg.encodings[path] = encoding
   208  	}
   209  }
   210  
   211  // WithEncodingPath is the same as WithEncodingFor but takes a ColumnPath directly.
   212  func WithEncodingPath(path ColumnPath, encoding Encoding) WriterProperty {
   213  	return WithEncodingFor(path.String(), encoding)
   214  }
   215  
   216  // WithCompression specifies the default compression type to use for column writing.
   217  func WithCompression(codec compress.Compression) WriterProperty {
   218  	return func(cfg *writerPropConfig) {
   219  		cfg.wr.defColumnProps.Codec = codec
   220  	}
   221  }
   222  
   223  // WithCompressionFor specifies the compression type for the given column.
   224  func WithCompressionFor(path string, codec compress.Compression) WriterProperty {
   225  	return func(cfg *writerPropConfig) {
   226  		cfg.codecs[path] = codec
   227  	}
   228  }
   229  
   230  // WithCompressionPath is the same as WithCompressionFor but takes a ColumnPath directly.
   231  func WithCompressionPath(path ColumnPath, codec compress.Compression) WriterProperty {
   232  	return WithCompressionFor(path.String(), codec)
   233  }
   234  
   235  // WithMaxStatsSize sets a maximum size for the statistics before we decide not to include them.
   236  func WithMaxStatsSize(maxStatsSize int64) WriterProperty {
   237  	return func(cfg *writerPropConfig) {
   238  		cfg.wr.defColumnProps.MaxStatsSize = maxStatsSize
   239  	}
   240  }
   241  
   242  // WithCompressionLevel specifies the default compression level for the compressor in every column.
   243  //
   244  // The provided compression level is compressor specific. The user would have to know what the available
   245  // levels are for the selected compressor. If the compressor does not allow for selecting different
   246  // compression levels, then this function will have no effect. Parquet and Arrow will not validate the
   247  // passed compression level. If no level is selected by the user or if the special compress.DefaultCompressionLevel
   248  // value is used, then parquet will select the compression level.
   249  func WithCompressionLevel(level int) WriterProperty {
   250  	return func(cfg *writerPropConfig) {
   251  		cfg.wr.defColumnProps.CompressionLevel = level
   252  	}
   253  }
   254  
   255  // WithCompressionLevelFor is like WithCompressionLevel but only for the given column path.
   256  func WithCompressionLevelFor(path string, level int) WriterProperty {
   257  	return func(cfg *writerPropConfig) {
   258  		cfg.compressLevel[path] = level
   259  	}
   260  }
   261  
   262  // WithCompressionLevelPath is the same as WithCompressionLevelFor but takes a ColumnPath
   263  func WithCompressionLevelPath(path ColumnPath, level int) WriterProperty {
   264  	return WithCompressionLevelFor(path.String(), level)
   265  }
   266  
   267  // WithStats specifies a default for whether or not to enable column statistics.
   268  func WithStats(enabled bool) WriterProperty {
   269  	return func(cfg *writerPropConfig) {
   270  		cfg.wr.defColumnProps.StatsEnabled = enabled
   271  	}
   272  }
   273  
   274  // WithStatsFor specifies a per column value as to enable or disable statistics in the resulting file.
   275  func WithStatsFor(path string, enabled bool) WriterProperty {
   276  	return func(cfg *writerPropConfig) {
   277  		cfg.statsEnabled[path] = enabled
   278  	}
   279  }
   280  
   281  // WithStatsPath is the same as WithStatsFor but takes a ColumnPath
   282  func WithStatsPath(path ColumnPath, enabled bool) WriterProperty {
   283  	return WithStatsFor(path.String(), enabled)
   284  }
   285  
   286  // WithEncryptionProperties specifies the file level encryption handling for writing the file.
   287  func WithEncryptionProperties(props *FileEncryptionProperties) WriterProperty {
   288  	return func(cfg *writerPropConfig) {
   289  		cfg.wr.encryptionProps = props
   290  	}
   291  }
   292  
// WriterProperties is the collection of properties to use for writing a parquet file. The values are
// read only once it has been constructed.
//
// Instances are built with NewWriterProperties; the fields are exposed through
// the accessor methods defined on this type.
type WriterProperties struct {
	mem             memory.Allocator // allocator, see WithAllocator
	dictPagesize    int64            // dictionary page size limit, see WithDictionaryPageSizeLimit
	batchSize       int64            // write batch size, see WithBatchSize
	maxRowGroupLen  int64            // maximum rows per row group, see WithMaxRowGroupLength
	pageSize        int64            // data page size, see WithDataPageSize
	parquetVersion  Version          // parquet spec version, see WithVersion
	createdBy       string           // "created by" string, see WithCreatedBy
	dataPageVersion DataPageVersion  // data page spec version, see WithDataPageVersion
	rootName        string           // name of the root schema node, see WithRootName
	rootRepetition  Repetition       // repetition of the root schema node, see WithRootRepetition

	defColumnProps  ColumnProperties             // defaults for columns without an override
	columnProps     map[string]*ColumnProperties // per-column overrides keyed by column path string
	encryptionProps *FileEncryptionProperties    // file encryption settings, nil unless WithEncryptionProperties was used
}
   311  
   312  func defaultWriterProperties() *WriterProperties {
   313  	return &WriterProperties{
   314  		mem:             memory.DefaultAllocator,
   315  		dictPagesize:    DefaultDictionaryPageSizeLimit,
   316  		batchSize:       DefaultWriteBatchSize,
   317  		maxRowGroupLen:  DefaultMaxRowGroupLen,
   318  		pageSize:        DefaultDataPageSize,
   319  		parquetVersion:  V2_LATEST,
   320  		dataPageVersion: DataPageV1,
   321  		createdBy:       DefaultCreatedBy,
   322  		rootName:        DefaultRootName,
   323  		rootRepetition:  Repetitions.Repeated,
   324  		defColumnProps:  DefaultColumnProperties(),
   325  	}
   326  }
   327  
   328  // NewWriterProperties takes a list of options for building the properties. If multiple options are used which conflict
   329  // then the last option is the one which will take effect. If no WriterProperty options are provided, then the default
   330  // properties will be utilized for writing.
   331  //
   332  // The Default properties use the following constants:
   333  //	Allocator:					memory.DefaultAllocator
   334  // 	DictionaryPageSize: DefaultDictionaryPageSizeLimit
   335  //	BatchSize:					DefaultWriteBatchSize
   336  //	MaxRowGroupLength:	DefaultMaxRowGroupLen
   337  //	PageSize:						DefaultDataPageSize
   338  //	ParquetVersion:			V1
   339  //	DataPageVersion:		DataPageV1
   340  //	CreatedBy:					DefaultCreatedBy
   341  func NewWriterProperties(opts ...WriterProperty) *WriterProperties {
   342  	cfg := writerPropConfig{
   343  		wr:            defaultWriterProperties(),
   344  		encodings:     make(map[string]Encoding),
   345  		codecs:        make(map[string]compress.Compression),
   346  		compressLevel: make(map[string]int),
   347  		dictEnabled:   make(map[string]bool),
   348  		statsEnabled:  make(map[string]bool),
   349  	}
   350  	for _, o := range opts {
   351  		o(&cfg)
   352  	}
   353  
   354  	cfg.wr.columnProps = make(map[string]*ColumnProperties)
   355  	get := func(key string) *ColumnProperties {
   356  		if p, ok := cfg.wr.columnProps[key]; ok {
   357  			return p
   358  		}
   359  		cfg.wr.columnProps[key] = new(ColumnProperties)
   360  		*cfg.wr.columnProps[key] = cfg.wr.defColumnProps
   361  		return cfg.wr.columnProps[key]
   362  	}
   363  
   364  	for key, value := range cfg.encodings {
   365  		get(key).Encoding = value
   366  	}
   367  
   368  	for key, value := range cfg.codecs {
   369  		get(key).Codec = value
   370  	}
   371  
   372  	for key, value := range cfg.compressLevel {
   373  		get(key).CompressionLevel = value
   374  	}
   375  
   376  	for key, value := range cfg.dictEnabled {
   377  		get(key).DictionaryEnabled = value
   378  	}
   379  
   380  	for key, value := range cfg.statsEnabled {
   381  		get(key).StatsEnabled = value
   382  	}
   383  	return cfg.wr
   384  }
   385  
   386  // FileEncryptionProperties returns the current encryption properties that were
   387  // used to create the writer properties.
   388  func (w *WriterProperties) FileEncryptionProperties() *FileEncryptionProperties {
   389  	return w.encryptionProps
   390  }
   391  
   392  func (w *WriterProperties) Allocator() memory.Allocator      { return w.mem }
   393  func (w *WriterProperties) CreatedBy() string                { return w.createdBy }
   394  func (w *WriterProperties) RootName() string                 { return w.rootName }
   395  func (w *WriterProperties) RootRepetition() Repetition       { return w.rootRepetition }
   396  func (w *WriterProperties) WriteBatchSize() int64            { return w.batchSize }
   397  func (w *WriterProperties) DataPageSize() int64              { return w.pageSize }
   398  func (w *WriterProperties) DictionaryPageSizeLimit() int64   { return w.dictPagesize }
   399  func (w *WriterProperties) Version() Version                 { return w.parquetVersion }
   400  func (w *WriterProperties) DataPageVersion() DataPageVersion { return w.dataPageVersion }
   401  func (w *WriterProperties) MaxRowGroupLength() int64         { return w.maxRowGroupLen }
   402  
   403  // Compression returns the default compression type that will be used for any columns that don't
   404  // have a specific compression defined.
   405  func (w *WriterProperties) Compression() compress.Compression { return w.defColumnProps.Codec }
   406  
   407  // CompressionFor will return the compression type that is specified for the given column path, or
   408  // the default compression codec if there isn't one specific to this column.
   409  func (w *WriterProperties) CompressionFor(path string) compress.Compression {
   410  	if p, ok := w.columnProps[path]; ok {
   411  		return p.Codec
   412  	}
   413  	return w.defColumnProps.Codec
   414  }
   415  
   416  //CompressionPath is the same as CompressionFor but takes a ColumnPath
   417  func (w *WriterProperties) CompressionPath(path ColumnPath) compress.Compression {
   418  	return w.CompressionFor(path.String())
   419  }
   420  
   421  // CompressionLevel returns the default compression level that will be used for any column
   422  // that doesn't have a compression level specified for it.
   423  func (w *WriterProperties) CompressionLevel() int { return w.defColumnProps.CompressionLevel }
   424  
   425  // CompressionLevelFor returns the compression level that will be utilized for the given column,
   426  // or the default compression level if the column doesn't have a specific level specified.
   427  func (w *WriterProperties) CompressionLevelFor(path string) int {
   428  	if p, ok := w.columnProps[path]; ok {
   429  		return p.CompressionLevel
   430  	}
   431  	return w.defColumnProps.CompressionLevel
   432  }
   433  
   434  // CompressionLevelPath is the same as CompressionLevelFor but takes a ColumnPath object
   435  func (w *WriterProperties) CompressionLevelPath(path ColumnPath) int {
   436  	return w.CompressionLevelFor(path.String())
   437  }
   438  
   439  // Encoding returns the default encoding that will be utilized for any columns which don't have a different value
   440  // specified.
   441  func (w *WriterProperties) Encoding() Encoding { return w.defColumnProps.Encoding }
   442  
   443  // EncodingFor returns the encoding that will be used for the given column path, or the default encoding if there
   444  // isn't one specified for this column.
   445  func (w *WriterProperties) EncodingFor(path string) Encoding {
   446  	if p, ok := w.columnProps[path]; ok {
   447  		return p.Encoding
   448  	}
   449  	return w.defColumnProps.Encoding
   450  }
   451  
   452  // EncodingPath is the same as EncodingFor but takes a ColumnPath object
   453  func (w *WriterProperties) EncodingPath(path ColumnPath) Encoding {
   454  	return w.EncodingFor(path.String())
   455  }
   456  
   457  // DictionaryIndexEncoding returns which encoding will be used for the Dictionary Index values based on the
   458  // parquet version. V1 uses PlainDict and V2 uses RLEDict
   459  func (w *WriterProperties) DictionaryIndexEncoding() Encoding {
   460  	if w.parquetVersion == V1_0 {
   461  		return Encodings.PlainDict
   462  	}
   463  	return Encodings.RLEDict
   464  }
   465  
   466  // DictionaryPageEncoding returns the encoding that will be utilized for the DictionaryPage itself based on the parquet
   467  // version. V1 uses PlainDict, v2 uses Plain
   468  func (w *WriterProperties) DictionaryPageEncoding() Encoding {
   469  	if w.parquetVersion == V1_0 {
   470  		return Encodings.PlainDict
   471  	}
   472  	return Encodings.Plain
   473  }
   474  
   475  // DictionaryEnabled returns the default value as for whether or not dictionary encoding will be utilized for columns
   476  // that aren't separately specified.
   477  func (w *WriterProperties) DictionaryEnabled() bool { return w.defColumnProps.DictionaryEnabled }
   478  
   479  // DictionaryEnabledFor returns whether or not dictionary encoding will be used for the specified column when writing
   480  // or the default value if the column was not separately specified.
   481  func (w *WriterProperties) DictionaryEnabledFor(path string) bool {
   482  	if p, ok := w.columnProps[path]; ok {
   483  		return p.DictionaryEnabled
   484  	}
   485  	return w.defColumnProps.DictionaryEnabled
   486  }
   487  
   488  // DictionaryEnabledPath is the same as DictionaryEnabledFor but takes a ColumnPath object.
   489  func (w *WriterProperties) DictionaryEnabledPath(path ColumnPath) bool {
   490  	return w.DictionaryEnabledFor(path.String())
   491  }
   492  
   493  // StatisticsEnabled returns the default value for whether or not stats are enabled to be written for columns
   494  // that aren't separately specified.
   495  func (w *WriterProperties) StatisticsEnabled() bool { return w.defColumnProps.StatsEnabled }
   496  
   497  // StatisticsEnabledFor returns whether stats will be written for the given column path, or the default value if
   498  // it wasn't separately specified.
   499  func (w *WriterProperties) StatisticsEnabledFor(path string) bool {
   500  	if p, ok := w.columnProps[path]; ok {
   501  		return p.StatsEnabled
   502  	}
   503  	return w.defColumnProps.StatsEnabled
   504  }
   505  
   506  // StatisticsEnabledPath is the same as StatisticsEnabledFor but takes a ColumnPath object.
   507  func (w *WriterProperties) StatisticsEnabledPath(path ColumnPath) bool {
   508  	return w.StatisticsEnabledFor(path.String())
   509  }
   510  
   511  // MaxStatsSize returns the default maximum size for stats
   512  func (w *WriterProperties) MaxStatsSize() int64 { return w.defColumnProps.MaxStatsSize }
   513  
   514  // MaxStatsSizeFor returns the maximum stat size for the given column path
   515  func (w *WriterProperties) MaxStatsSizeFor(path string) int64 {
   516  	if p, ok := w.columnProps[path]; ok {
   517  		return p.MaxStatsSize
   518  	}
   519  	return w.defColumnProps.MaxStatsSize
   520  }
   521  
   522  // MaxStatsSizePath is the same as MaxStatsSizeFor but takes a ColumnPath
   523  func (w *WriterProperties) MaxStatsSizePath(path ColumnPath) int64 {
   524  	return w.MaxStatsSizeFor(path.String())
   525  }
   526  
   527  // ColumnEncryptionProperties returns the specific properties for encryption that will be used for the given column path
   528  func (w *WriterProperties) ColumnEncryptionProperties(path string) *ColumnEncryptionProperties {
   529  	if w.encryptionProps != nil {
   530  		return w.encryptionProps.ColumnEncryptionProperties(path)
   531  	}
   532  	return nil
   533  }