github.com/fraugster/parquet-go@v0.12.0/file_writer.go (about)

     1  package goparquet
     2  
     3  import (
     4  	"bufio"
     5  	"context"
     6  	"encoding/binary"
     7  	"io"
     8  
     9  	"github.com/fraugster/parquet-go/parquet"
    10  	"github.com/fraugster/parquet-go/parquetschema"
    11  )
    12  
// FileWriter is used to write data to a parquet file. Always use NewFileWriter
// to create such an object.
type FileWriter struct {
	// w is a position-tracking wrapper around bw; all file output goes through it.
	w writePos
	// bw buffers writes to the user-provided io.Writer; it is flushed in Close.
	bw *bufio.Writer

	// version is the parquet format version recorded in the file footer.
	version int32

	// schemaWriter holds the schema and buffers column data for the current row group.
	schemaWriter *schema

	// totalNumRecords is the number of records across all flushed row groups.
	totalNumRecords int64
	// kvStore holds file-level key-value metadata written to the footer.
	kvStore map[string]string
	// createdBy is recorded in the footer's CreatedBy field.
	createdBy string

	// rowGroupFlushSize, when > 0, enables automatic row-group flushing in
	// AddData once the buffered data size reaches this value.
	rowGroupFlushSize int64

	// rowGroups collects the metadata of all flushed row groups for the footer.
	rowGroups []*parquet.RowGroup

	// codec is the compression codec used when writing column chunks.
	codec parquet.CompressionCodec

	// newPageFunc selects the data page writer (V1 by default, V2 via WithDataPageV2).
	newPageFunc newDataPageFunc

	// ctx is the default context used by FlushRowGroup and Close.
	ctx context.Context

	// schemaDef, if set via WithSchemaDefinition, is applied to schemaWriter
	// at the end of NewFileWriter, after all other options have run.
	schemaDef *parquetschema.SchemaDefinition
}
    40  
// FileWriterOption describes an option function that is applied to a FileWriter when it is created.
// Options are applied in the order they are passed to NewFileWriter.
type FileWriterOption func(fw *FileWriter)
    43  
    44  // NewFileWriter creates a new FileWriter. You can provide FileWriterOptions to influence the
    45  // file writer's behaviour.
    46  func NewFileWriter(w io.Writer, options ...FileWriterOption) *FileWriter {
    47  	bw := bufio.NewWriter(w)
    48  	fw := &FileWriter{
    49  		w: &writePosStruct{
    50  			w:   bw,
    51  			pos: 0,
    52  		},
    53  		bw:           bw,
    54  		version:      1,
    55  		schemaWriter: &schema{}, // no allocTracker is set here because we're creating a writer. We assume for the moment that writers have enough control over input that they're trusted.
    56  		kvStore:      make(map[string]string),
    57  		rowGroups:    []*parquet.RowGroup{},
    58  		createdBy:    "parquet-go",
    59  		newPageFunc:  newDataPageV1Writer,
    60  		ctx:          context.Background(),
    61  	}
    62  
    63  	for _, opt := range options {
    64  		opt(fw)
    65  	}
    66  
    67  	// if a WithSchemaDefinition option was provided, the schema needs to be set after everything else
    68  	// as other options can change settings on the schemaWriter (such as the maximum page size).
    69  	if fw.schemaDef != nil {
    70  		if err := fw.schemaWriter.SetSchemaDefinition(fw.schemaDef); err != nil {
    71  			panic(err) // TODO: this shouldn't happen, but still isn't great. We messed up the API design for options and NewFileWriter.
    72  		}
    73  	}
    74  
    75  	return fw
    76  }
    77  
    78  // FileVersion sets the version of the file itself.
    79  func FileVersion(version int32) FileWriterOption {
    80  	return func(fw *FileWriter) {
    81  		fw.version = version
    82  	}
    83  }
    84  
    85  // WithCreator sets the creator in the meta data of the file.
    86  func WithCreator(createdBy string) FileWriterOption {
    87  	return func(fw *FileWriter) {
    88  		fw.createdBy = createdBy
    89  	}
    90  }
    91  
    92  // WithCompressionCodec sets the compression codec used when writing the file.
    93  func WithCompressionCodec(codec parquet.CompressionCodec) FileWriterOption {
    94  	return func(fw *FileWriter) {
    95  		fw.codec = codec
    96  	}
    97  }
    98  
    99  // WithMetaData sets the key-value meta data on the file.
   100  func WithMetaData(data map[string]string) FileWriterOption {
   101  	return func(fw *FileWriter) {
   102  		if data != nil {
   103  			fw.kvStore = data
   104  			return
   105  		}
   106  		fw.kvStore = make(map[string]string)
   107  	}
   108  }
   109  
   110  // WithMaxRowGroupSize sets the rough maximum size of a row group before it shall
   111  // be flushed automatically. Please note that enabling auto-flush will not allow
   112  // you to set per-column-chunk meta-data upon calling FlushRowGroup. If you
   113  // require this feature, you need to flush your rowgroups manually.
   114  func WithMaxRowGroupSize(size int64) FileWriterOption {
   115  	return func(fw *FileWriter) {
   116  		fw.rowGroupFlushSize = size
   117  	}
   118  }
   119  
// WithMaxPageSize sets the maximum page size used by the writer's schema when
// writing out data pages. NOTE(review): presumably the size is a byte count —
// confirm against the schema implementation.
func WithMaxPageSize(size int64) FileWriterOption {
	return func(fw *FileWriter) {
		fw.schemaWriter.maxPageSize = size
	}
}
   125  
   126  // WithSchemaDefinition sets the schema definition to use for this parquet file.
   127  func WithSchemaDefinition(sd *parquetschema.SchemaDefinition) FileWriterOption {
   128  	return func(fw *FileWriter) {
   129  		fw.schemaDef = sd
   130  	}
   131  }
   132  
   133  // WithDataPageV2 enables the writer to write pages in the new V2 format. By default,
   134  // the library is using the V1 format. Please be aware that this may cause compatibility
   135  // issues with older implementations of parquet.
   136  func WithDataPageV2() FileWriterOption {
   137  	return func(fw *FileWriter) {
   138  		fw.newPageFunc = newDataPageV2Writer
   139  	}
   140  }
   141  
// WithCRC controls whether the writer computes CRC checksums for pages.
// NOTE(review): the flag is only stored on the schema writer here; verify in
// the page-writing code that it produces CRC32 page checksums.
func WithCRC(enableCRC bool) FileWriterOption {
	return func(fw *FileWriter) {
		fw.schemaWriter.enableCRC = enableCRC
	}
}
   147  
   148  // WithWriterContext overrides the default context (which is a context.Background())
   149  // in the FileWriter with the provided context.Context object.
   150  func WithWriterContext(ctx context.Context) FileWriterOption {
   151  	return func(fw *FileWriter) {
   152  		fw.ctx = ctx
   153  	}
   154  }
   155  
// columnKeyValues associates key-value metadata with a single column,
// identified by its path.
type columnKeyValues struct {
	path ColumnPath
	kv   map[string]string
}
   160  
// flushRowGroupOptionHandle accumulates the metadata provided through
// FlushRowGroupOptions: per-column entries in cols and file-wide entries in global.
type flushRowGroupOptionHandle struct {
	cols   []columnKeyValues
	global map[string]string
}
   165  
   166  func newFlushRowGroupOptionHandle() *flushRowGroupOptionHandle {
   167  	return &flushRowGroupOptionHandle{
   168  		global: make(map[string]string),
   169  	}
   170  }
   171  
   172  func (h *flushRowGroupOptionHandle) getMetaData(path ColumnPath) map[string]string {
   173  	data := make(map[string]string)
   174  
   175  	for k, v := range h.global {
   176  		data[k] = v
   177  	}
   178  
   179  	for _, col := range h.cols {
   180  		if col.path.Equal(path) {
   181  			for k, v := range col.kv {
   182  				data[k] = v
   183  			}
   184  		}
   185  	}
   186  
   187  	if len(data) > 0 {
   188  		return data
   189  	}
   190  	return nil
   191  }
   192  
// FlushRowGroupOption is an option to pass additional configuration to FlushRowGroup.
type FlushRowGroupOption func(h *flushRowGroupOptionHandle)
   195  
// WithRowGroupMetaDataForColumn adds key-value metadata to a particular column that is identified
// by its full dotted-notation name.
//
// Deprecated: use WithRowGroupMetaDataForColumnPath instead.
func WithRowGroupMetaDataForColumn(col string, kv map[string]string) FlushRowGroupOption {
	// The dotted name is parsed into a ColumnPath and delegated, so columns
	// whose names contain '.' cannot be addressed through this function.
	return WithRowGroupMetaDataForColumnPath(parseColumnPath(col), kv)
}
   203  
   204  // WithRowGroupMetaDataForColumnPath adds key-value metadata to a particular column that is identified
   205  // by its ColumnPath.
   206  func WithRowGroupMetaDataForColumnPath(path ColumnPath, kv map[string]string) FlushRowGroupOption {
   207  	return func(h *flushRowGroupOptionHandle) {
   208  		// at this point, we don't worry if we have multiple records for the same column.
   209  		// All the data will get merged in getMetaData.
   210  		h.cols = append(h.cols, columnKeyValues{
   211  			path: path,
   212  			kv:   kv,
   213  		})
   214  	}
   215  }
   216  
   217  // WithRowGroupMetaData adds key-value metadata to all columns. Please note that if you use the same
   218  // key both in the meta data for all columns as well as in column-specific meta data
   219  // (using MetaDataForColumn), the column-specific meta data has preference.
   220  func WithRowGroupMetaData(kv map[string]string) FlushRowGroupOption {
   221  	return func(h *flushRowGroupOptionHandle) {
   222  		for k, v := range kv {
   223  			h.global[k] = v
   224  		}
   225  	}
   226  }
   227  
// FlushRowGroup writes the current row group to the parquet file. It uses the
// FileWriter's default context (see WithWriterContext).
func (fw *FileWriter) FlushRowGroup(opts ...FlushRowGroupOption) error {
	return fw.FlushRowGroupWithContext(fw.ctx, opts...)
}
   232  
   233  // FlushRowGroupWithContext writes the current row group to the parquet file.
   234  func (fw *FileWriter) FlushRowGroupWithContext(ctx context.Context, opts ...FlushRowGroupOption) error {
   235  	// Write the entire row group
   236  	if fw.schemaWriter.rowGroupNumRecords() == 0 {
   237  		return nil
   238  	}
   239  
   240  	if fw.w.Pos() == 0 {
   241  		if err := writeFull(fw.w, magic); err != nil {
   242  			return err
   243  		}
   244  	}
   245  
   246  	h := newFlushRowGroupOptionHandle()
   247  
   248  	for _, o := range opts {
   249  		o(h)
   250  	}
   251  
   252  	cc, err := writeRowGroup(ctx, fw.w, fw.schemaWriter, fw.codec, fw.newPageFunc, h)
   253  	if err != nil {
   254  		return err
   255  	}
   256  
   257  	var totalCompressedSize, totalUncompressedSize int64
   258  
   259  	for _, c := range cc {
   260  		totalCompressedSize += c.MetaData.TotalCompressedSize
   261  		totalUncompressedSize += c.MetaData.TotalUncompressedSize
   262  	}
   263  
   264  	fw.rowGroups = append(fw.rowGroups, &parquet.RowGroup{
   265  		Columns:             cc,
   266  		TotalByteSize:       totalUncompressedSize,
   267  		TotalCompressedSize: &totalCompressedSize,
   268  		NumRows:             fw.schemaWriter.rowGroupNumRecords(),
   269  		SortingColumns:      nil,
   270  	})
   271  	fw.totalNumRecords += fw.schemaWriter.rowGroupNumRecords()
   272  	// flush the schema
   273  	fw.schemaWriter.resetData()
   274  
   275  	return nil
   276  }
   277  
   278  // AddData adds a new record to the current row group and flushes it if auto-flush is enabled and the size
   279  // is equal to or greater than the configured maximum row group size.
   280  func (fw *FileWriter) AddData(m map[string]interface{}) error {
   281  	if err := fw.schemaWriter.AddData(m); err != nil {
   282  		return err
   283  	}
   284  
   285  	if fw.rowGroupFlushSize > 0 && fw.schemaWriter.DataSize() >= fw.rowGroupFlushSize {
   286  		return fw.FlushRowGroup()
   287  	}
   288  
   289  	return nil
   290  }
   291  
// Close flushes the current row group if necessary, taking the provided
// options into account, and writes the meta data footer to the file.
// Please be aware that this only finalizes the writing process. If you
// provided a file as io.Writer when creating the FileWriter, you still need
// to Close that file handle separately. Close uses the FileWriter's default
// context (see WithWriterContext).
func (fw *FileWriter) Close(opts ...FlushRowGroupOption) error {
	return fw.CloseWithContext(fw.ctx, opts...)
}
   300  
   301  // CloseWithContext flushes the current row group if necessary, taking the provided
   302  // options into account, and writes the meta data footer to the file.
   303  // Please be aware that this only finalizes the writing process. If you
   304  // provided a file as io.Writer when creating the FileWriter, you still need
   305  // to Close that file handle separately.
   306  func (fw *FileWriter) CloseWithContext(ctx context.Context, opts ...FlushRowGroupOption) error {
   307  	if fw.schemaWriter.rowGroupNumRecords() > 0 {
   308  		if err := fw.FlushRowGroup(opts...); err != nil {
   309  			return err
   310  		}
   311  	}
   312  
   313  	kv := make([]*parquet.KeyValue, 0, len(fw.kvStore))
   314  	for i := range fw.kvStore {
   315  		v := fw.kvStore[i]
   316  		addr := &v
   317  		if v == "" {
   318  			addr = nil
   319  		}
   320  		kv = append(kv, &parquet.KeyValue{
   321  			Key:   i,
   322  			Value: addr,
   323  		})
   324  	}
   325  	meta := &parquet.FileMetaData{
   326  		Version:          fw.version,
   327  		Schema:           fw.schemaWriter.getSchemaArray(),
   328  		NumRows:          fw.totalNumRecords,
   329  		RowGroups:        fw.rowGroups,
   330  		KeyValueMetadata: kv,
   331  		CreatedBy:        &fw.createdBy,
   332  		ColumnOrders:     nil,
   333  	}
   334  
   335  	pos := fw.w.Pos()
   336  	if err := writeThrift(ctx, meta, fw.w); err != nil {
   337  		return err
   338  	}
   339  
   340  	ln := int32(fw.w.Pos() - pos)
   341  	if err := binary.Write(fw.w, binary.LittleEndian, &ln); err != nil {
   342  		return err
   343  	}
   344  
   345  	if err := writeFull(fw.w, magic); err != nil {
   346  		return err
   347  	}
   348  
   349  	return fw.bw.Flush()
   350  }
   351  
// CurrentRowGroupSize returns a rough estimation of the uncompressed size of the current row group data. If you selected
// a compression format other than UNCOMPRESSED, the final size will most likely be smaller and will depend on how well
// your data can be compressed.
func (fw *FileWriter) CurrentRowGroupSize() int64 {
	return fw.schemaWriter.DataSize()
}
   358  
// CurrentFileSize returns the amount of data written to the file so far. This does not include data that is in the
// current row group and has not been flushed yet. After closing the file, the size will be even larger since the
// footer is appended to the file upon closing.
func (fw *FileWriter) CurrentFileSize() int64 {
	// The write-position wrapper tracks every byte written, so its position
	// equals the flushed file size.
	return fw.w.Pos()
}
   365  
// AddColumn adds a single column to the parquet schema. The path is provided in dotted notation. All
// parent elements in this dot-separated path need to exist, otherwise the method returns an error. Any
// data contained in the column store is reset.
//
// Deprecated: use AddColumnByPath instead. AddColumn uses '.' as separator between
// path elements, which makes it impossible to address columns that contains a '.' in their name.
func (fw *FileWriter) AddColumn(path string, col *Column) error {
	return fw.schemaWriter.AddColumn(path, col)
}
   375  
// AddColumnByPath adds a single column to the parquet schema. The path is provided as ColumnPath. All
// parent elements in the column path need to exist, otherwise the method returns an error. Any
// data contained in the column store is reset.
func (fw *FileWriter) AddColumnByPath(path ColumnPath, col *Column) error {
	return fw.schemaWriter.AddColumnByPath(path, col)
}
   382  
// AddGroup adds a new group to the parquet schema. The provided path is written in dotted notation.
// All parent elements in this dot-separated path need to exist, otherwise the method returns an error.
//
// Deprecated: use AddGroupByPath instead. AddGroup uses '.' as separator between
// path elements, which makes it impossible to address columns that contains a '.' in their name.
func (fw *FileWriter) AddGroup(path string, rep parquet.FieldRepetitionType) error {
	// The dotted path is parsed here and delegated to the path-based variant.
	return fw.schemaWriter.AddGroupByPath(parseColumnPath(path), rep)
}
   391  
// AddGroupByPath adds a new group to the parquet schema. The path is provided as ColumnPath.
// All parent elements in the path need to exist, otherwise the method returns an error.
func (fw *FileWriter) AddGroupByPath(path ColumnPath, rep parquet.FieldRepetitionType) error {
	return fw.schemaWriter.AddGroupByPath(path, rep)
}
   397  
// GetSchemaDefinition returns the schema definition that has been set in this file writer.
func (fw *FileWriter) GetSchemaDefinition() *parquetschema.SchemaDefinition {
	return fw.schemaWriter.GetSchemaDefinition()
}
   402  
// SetSchemaDefinition sets the schema definition for this file writer.
func (fw *FileWriter) SetSchemaDefinition(schemaDef *parquetschema.SchemaDefinition) error {
	return fw.schemaWriter.SetSchemaDefinition(schemaDef)
}
   407  
// Columns returns the list of columns defined in the schema of this file writer.
func (fw *FileWriter) Columns() []*Column {
	return fw.schemaWriter.Columns()
}
   412  
// GetColumnByName returns a column identified by name. If the column doesn't exist,
// the method returns nil.
//
// Deprecated: use GetColumnByPath instead. GetColumnByName uses '.' as separator between
// path elements, which makes it impossible to address columns that contains a '.' in their name.
func (fw *FileWriter) GetColumnByName(name string) *Column {
	return fw.schemaWriter.GetColumnByName(name)
}
   421  
// GetColumnByPath returns a column identified by its path. If the column doesn't exist,
// nil is returned.
func (fw *FileWriter) GetColumnByPath(path ColumnPath) *Column {
	return fw.schemaWriter.GetColumnByPath(path)
}