github.com/apache/arrow/go/v16@v16.1.0/parquet/file/file_writer.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"encoding/binary"
    21  	"fmt"
    22  	"io"
    23  
    24  	"github.com/apache/arrow/go/v16/parquet"
    25  	"github.com/apache/arrow/go/v16/parquet/internal/encryption"
    26  	"github.com/apache/arrow/go/v16/parquet/internal/utils"
    27  	"github.com/apache/arrow/go/v16/parquet/metadata"
    28  	"github.com/apache/arrow/go/v16/parquet/schema"
    29  )
    30  
// Writer is the primary interface for writing a parquet file
type Writer struct {
	// sink is the destination all file bytes are written to; it is closed
	// when the Writer is closed.
	sink utils.WriteCloserTell
	// open is true from construction until Close has been called once.
	open bool
	// footerFlushed records that a footer has been written and no row group
	// has been appended since.
	footerFlushed bool
	// props holds the writer properties in effect for this file.
	props *parquet.WriterProperties
	// rowGroups counts the row groups appended so far.
	rowGroups int
	// nrows accumulates the row counts of row groups as they are closed.
	nrows int
	// metadata builds the file-level metadata that ends up in the footer.
	metadata metadata.FileMetaDataBuilder
	// fileEncryptor is only set when the writer properties carry file
	// encryption properties; it is nil otherwise.
	fileEncryptor encryption.FileEncryptor
	// rowGroupWriter is the most recently appended row group writer, or nil
	// if there is none (for example right after a footer flush).
	rowGroupWriter *rowGroupWriter

	// The Schema of this writer
	Schema *schema.Schema
}
    46  
// writerConfig collects the optional settings that WriteOption functions
// populate before a Writer is constructed.
type writerConfig struct {
	// props are the writer properties to use; when left nil the constructor
	// substitutes the defaults.
	props *parquet.WriterProperties
	// keyValueMetadata is handed to the file metadata builder at construction.
	keyValueMetadata metadata.KeyValueMetadata
}
    51  
// WriteOption is a functional option for NewParquetWriter. Each option
// receives the writer configuration and sets the fields it cares about.
type WriteOption func(*writerConfig)
    53  
    54  func WithWriterProps(props *parquet.WriterProperties) WriteOption {
    55  	return func(c *writerConfig) {
    56  		c.props = props
    57  	}
    58  }
    59  
    60  func WithWriteMetadata(meta metadata.KeyValueMetadata) WriteOption {
    61  	return func(c *writerConfig) {
    62  		c.keyValueMetadata = meta
    63  	}
    64  }
    65  
    66  // NewParquetWriter returns a Writer that writes to the provided WriteSeeker with the given schema.
    67  //
    68  // If props is nil, then the default Writer Properties will be used. If the key value metadata is not nil,
    69  // it will be added to the file.
    70  func NewParquetWriter(w io.Writer, sc *schema.GroupNode, opts ...WriteOption) *Writer {
    71  	config := &writerConfig{}
    72  	for _, o := range opts {
    73  		o(config)
    74  	}
    75  	if config.props == nil {
    76  		config.props = parquet.NewWriterProperties()
    77  	}
    78  
    79  	fileSchema := schema.NewSchema(sc)
    80  	fw := &Writer{
    81  		props:  config.props,
    82  		sink:   &utils.TellWrapper{Writer: w},
    83  		open:   true,
    84  		Schema: fileSchema,
    85  	}
    86  
    87  	fw.metadata = *metadata.NewFileMetadataBuilder(fw.Schema, fw.props, config.keyValueMetadata)
    88  	fw.startFile()
    89  	return fw
    90  }
    91  
    92  // NumColumns returns the number of columns to write as defined by the schema.
    93  func (fw *Writer) NumColumns() int { return fw.Schema.NumColumns() }
    94  
    95  // NumRowGroups returns the current number of row groups that will be written for this file.
    96  func (fw *Writer) NumRowGroups() int { return fw.rowGroups }
    97  
    98  // NumRows returns the current number of rows that have be written
    99  func (fw *Writer) NumRows() int { return fw.nrows }
   100  
   101  // Properties returns the writer properties that are in use for this file.
   102  func (fw *Writer) Properties() *parquet.WriterProperties { return fw.props }
   103  
   104  // AppendBufferedRowGroup appends a rowgroup to the file and returns a writer
   105  // that buffers the row group in memory allowing writing multiple columns
   106  // at once to the row group. Data is not flushed out until the row group
   107  // is closed.
   108  //
   109  // When calling Close, all columns must have the same number of rows written.
   110  func (fw *Writer) AppendBufferedRowGroup() BufferedRowGroupWriter {
   111  	return fw.appendRowGroup(true)
   112  }
   113  
   114  // AppendRowGroup appends a row group to the file and returns a writer
   115  // that writes columns to the row group in serial via calling NextColumn.
   116  //
   117  // When calling NextColumn, the same number of rows need to have been written
   118  // to each column before moving on. Otherwise the rowgroup writer will panic.
   119  func (fw *Writer) AppendRowGroup() SerialRowGroupWriter {
   120  	return fw.appendRowGroup(false)
   121  }
   122  
   123  func (fw *Writer) appendRowGroup(buffered bool) *rowGroupWriter {
   124  	if fw.rowGroupWriter != nil {
   125  		fw.nrows += fw.rowGroupWriter.nrows
   126  		fw.rowGroupWriter.Close()
   127  	}
   128  	fw.rowGroups++
   129  	fw.footerFlushed = false
   130  	rgMeta := fw.metadata.AppendRowGroup()
   131  	fw.rowGroupWriter = newRowGroupWriter(fw.sink, rgMeta, int16(fw.rowGroups)-1, fw.props, buffered, fw.fileEncryptor)
   132  	return fw.rowGroupWriter
   133  }
   134  
   135  func (fw *Writer) startFile() {
   136  	encryptionProps := fw.props.FileEncryptionProperties()
   137  	magic := magicBytes
   138  	if encryptionProps != nil {
   139  		// check that all columns in columnEncryptionProperties exist in the schema
   140  		encryptedCols := encryptionProps.EncryptedColumns()
   141  		// if columnEncryptionProperties is empty, every column in the file schema will be encrypted with the footer key
   142  		if len(encryptedCols) != 0 {
   143  			colPaths := make(map[string]bool)
   144  			for i := 0; i < fw.Schema.NumColumns(); i++ {
   145  				colPaths[fw.Schema.Column(i).Path()] = true
   146  			}
   147  			for k := range encryptedCols {
   148  				if _, ok := colPaths[k]; !ok {
   149  					panic("encrypted column " + k + " not found in file schema")
   150  				}
   151  			}
   152  		}
   153  
   154  		fw.fileEncryptor = encryption.NewFileEncryptor(encryptionProps, fw.props.Allocator())
   155  		if encryptionProps.EncryptedFooter() {
   156  			magic = magicEBytes
   157  		}
   158  	}
   159  	n, err := fw.sink.Write(magic)
   160  	if n != 4 || err != nil {
   161  		panic("failed to write magic number")
   162  	}
   163  }
   164  
   165  // AppendKeyValueMetadata appends a key/value pair to the existing key/value metadata
   166  func (fw *Writer) AppendKeyValueMetadata(key string, value string) error {
   167  	return fw.metadata.AppendKeyValueMetadata(key, value)
   168  }
   169  
   170  // Close closes any open row group writer and writes the file footer. Subsequent
   171  // calls to close will have no effect.
   172  func (fw *Writer) Close() (err error) {
   173  	if fw.open {
   174  		// if any functions here panic, we set open to be false so
   175  		// that this doesn't get called again
   176  		fw.open = false
   177  
   178  		defer func() {
   179  			fw.closeEncryptor()
   180  			ierr := fw.sink.Close()
   181  			if err != nil {
   182  				if ierr != nil {
   183  					err = fmt.Errorf("error on close:%w, %s", err, ierr)
   184  				}
   185  				return
   186  			}
   187  
   188  			err = ierr
   189  		}()
   190  
   191  		err = fw.FlushWithFooter()
   192  		fw.metadata.Clear()
   193  	}
   194  	return nil
   195  }
   196  
   197  // FlushWithFooter closes any open row group writer and writes the file footer, leaving
   198  // the writer open for additional row groups.  Additional footers written by later
   199  // calls to FlushWithFooter or Close will be cumulative, so that only the last footer
   200  // written need ever be read by a reader.
   201  func (fw *Writer) FlushWithFooter() error {
   202  	if !fw.footerFlushed {
   203  		if fw.rowGroupWriter != nil {
   204  			fw.nrows += fw.rowGroupWriter.nrows
   205  			fw.rowGroupWriter.Close()
   206  		}
   207  		fw.rowGroupWriter = nil
   208  
   209  		fileMetadata, err := fw.metadata.Snapshot()
   210  		if err != nil {
   211  			return err
   212  		}
   213  
   214  		fileEncryptProps := fw.props.FileEncryptionProperties()
   215  		if fileEncryptProps == nil { // non encrypted file
   216  			if _, err = writeFileMetadata(fileMetadata, fw.sink); err != nil {
   217  				return err
   218  			}
   219  		} else {
   220  			if err := fw.flushEncryptedFile(fileMetadata, fileEncryptProps); err != nil {
   221  				return err
   222  			}
   223  		}
   224  
   225  		fw.footerFlushed = true
   226  	}
   227  	return nil
   228  }
   229  
   230  func (fw *Writer) flushEncryptedFile(fileMetadata *metadata.FileMetaData, props *parquet.FileEncryptionProperties) error {
   231  	// encrypted file with encrypted footer
   232  	if props.EncryptedFooter() {
   233  		footerLen := int64(0)
   234  
   235  		cryptoMetadata := fw.metadata.GetFileCryptoMetaData()
   236  		n, err := writeFileCryptoMetadata(cryptoMetadata, fw.sink)
   237  		if err != nil {
   238  			return err
   239  		}
   240  
   241  		footerLen += n
   242  		footerEncryptor := fw.fileEncryptor.GetFooterEncryptor()
   243  		n, err = writeEncryptedFileMetadata(fileMetadata, fw.sink, footerEncryptor, true)
   244  		if err != nil {
   245  			return err
   246  		}
   247  		footerLen += n
   248  
   249  		if err = binary.Write(fw.sink, binary.LittleEndian, uint32(footerLen)); err != nil {
   250  			return err
   251  		}
   252  		if _, err = fw.sink.Write(magicEBytes); err != nil {
   253  			return err
   254  		}
   255  	} else {
   256  		footerSigningEncryptor := fw.fileEncryptor.GetFooterSigningEncryptor()
   257  		if _, err := writeEncryptedFileMetadata(fileMetadata, fw.sink, footerSigningEncryptor, false); err != nil {
   258  			return err
   259  		}
   260  	}
   261  	return nil
   262  }
   263  
   264  func (fw *Writer) closeEncryptor() {
   265  	if fw.fileEncryptor != nil {
   266  		fw.fileEncryptor.WipeOutEncryptionKeys()
   267  	}
   268  }
   269  
   270  func writeFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer) (n int64, err error) {
   271  	n, err = fileMetadata.WriteTo(w, nil)
   272  	if err != nil {
   273  		return
   274  	}
   275  
   276  	if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil {
   277  		return
   278  	}
   279  	if _, err = w.Write(magicBytes); err != nil {
   280  		return
   281  	}
   282  	return n + int64(4+len(magicBytes)), nil
   283  }
   284  
   285  func writeEncryptedFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer, encryptor encryption.Encryptor, encryptFooter bool) (n int64, err error) {
   286  	n, err = fileMetadata.WriteTo(w, encryptor)
   287  	if encryptFooter {
   288  		return
   289  	}
   290  	if err != nil {
   291  		return
   292  	}
   293  	if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil {
   294  		return
   295  	}
   296  	if _, err = w.Write(magicBytes); err != nil {
   297  		return
   298  	}
   299  	return n + int64(4+len(magicBytes)), nil
   300  }
   301  
   302  func writeFileCryptoMetadata(crypto *metadata.FileCryptoMetadata, w io.Writer) (int64, error) {
   303  	return crypto.WriteTo(w)
   304  }