github.com/apache/arrow/go/v14@v14.0.2/parquet/file/file_writer.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"encoding/binary"
    21  	"fmt"
    22  	"io"
    23  
    24  	"github.com/apache/arrow/go/v14/parquet"
    25  	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
    26  	"github.com/apache/arrow/go/v14/parquet/internal/utils"
    27  	"github.com/apache/arrow/go/v14/parquet/metadata"
    28  	"github.com/apache/arrow/go/v14/parquet/schema"
    29  )
    30  
    31  // Writer is the primary interface for writing a parquet file
    32  type Writer struct {
    33  	sink           utils.WriteCloserTell
    34  	open           bool
    35  	props          *parquet.WriterProperties
    36  	rowGroups      int
    37  	nrows          int
    38  	metadata       metadata.FileMetaDataBuilder
    39  	fileEncryptor  encryption.FileEncryptor
    40  	rowGroupWriter *rowGroupWriter
    41  
    42  	// The Schema of this writer
    43  	Schema *schema.Schema
    44  }
    45  
    46  type writerConfig struct {
    47  	props            *parquet.WriterProperties
    48  	keyValueMetadata metadata.KeyValueMetadata
    49  }
    50  
    51  type WriteOption func(*writerConfig)
    52  
    53  func WithWriterProps(props *parquet.WriterProperties) WriteOption {
    54  	return func(c *writerConfig) {
    55  		c.props = props
    56  	}
    57  }
    58  
    59  func WithWriteMetadata(meta metadata.KeyValueMetadata) WriteOption {
    60  	return func(c *writerConfig) {
    61  		c.keyValueMetadata = meta
    62  	}
    63  }
    64  
    65  // NewParquetWriter returns a Writer that writes to the provided WriteSeeker with the given schema.
    66  //
    67  // If props is nil, then the default Writer Properties will be used. If the key value metadata is not nil,
    68  // it will be added to the file.
    69  func NewParquetWriter(w io.Writer, sc *schema.GroupNode, opts ...WriteOption) *Writer {
    70  	config := &writerConfig{}
    71  	for _, o := range opts {
    72  		o(config)
    73  	}
    74  	if config.props == nil {
    75  		config.props = parquet.NewWriterProperties()
    76  	}
    77  
    78  	fileSchema := schema.NewSchema(sc)
    79  	fw := &Writer{
    80  		props:  config.props,
    81  		sink:   &utils.TellWrapper{Writer: w},
    82  		open:   true,
    83  		Schema: fileSchema,
    84  	}
    85  
    86  	fw.metadata = *metadata.NewFileMetadataBuilder(fw.Schema, fw.props, config.keyValueMetadata)
    87  	fw.startFile()
    88  	return fw
    89  }
    90  
    91  // NumColumns returns the number of columns to write as defined by the schema.
    92  func (fw *Writer) NumColumns() int { return fw.Schema.NumColumns() }
    93  
    94  // NumRowGroups returns the current number of row groups that will be written for this file.
    95  func (fw *Writer) NumRowGroups() int { return fw.rowGroups }
    96  
    97  // NumRows returns the current number of rows that have be written
    98  func (fw *Writer) NumRows() int { return fw.nrows }
    99  
   100  // Properties returns the writer properties that are in use for this file.
   101  func (fw *Writer) Properties() *parquet.WriterProperties { return fw.props }
   102  
   103  // AppendBufferedRowGroup appends a rowgroup to the file and returns a writer
   104  // that buffers the row group in memory allowing writing multiple columns
   105  // at once to the row group. Data is not flushed out until the row group
   106  // is closed.
   107  //
   108  // When calling Close, all columns must have the same number of rows written.
   109  func (fw *Writer) AppendBufferedRowGroup() BufferedRowGroupWriter {
   110  	return fw.appendRowGroup(true)
   111  }
   112  
   113  // AppendRowGroup appends a row group to the file and returns a writer
   114  // that writes columns to the row group in serial via calling NextColumn.
   115  //
   116  // When calling NextColumn, the same number of rows need to have been written
   117  // to each column before moving on. Otherwise the rowgroup writer will panic.
   118  func (fw *Writer) AppendRowGroup() SerialRowGroupWriter {
   119  	return fw.appendRowGroup(false)
   120  }
   121  
   122  func (fw *Writer) appendRowGroup(buffered bool) *rowGroupWriter {
   123  	if fw.rowGroupWriter != nil {
   124  		fw.rowGroupWriter.Close()
   125  	}
   126  	fw.rowGroups++
   127  	rgMeta := fw.metadata.AppendRowGroup()
   128  	fw.rowGroupWriter = newRowGroupWriter(fw.sink, rgMeta, int16(fw.rowGroups)-1, fw.props, buffered, fw.fileEncryptor)
   129  	return fw.rowGroupWriter
   130  }
   131  
   132  func (fw *Writer) startFile() {
   133  	encryptionProps := fw.props.FileEncryptionProperties()
   134  	magic := magicBytes
   135  	if encryptionProps != nil {
   136  		// check that all columns in columnEncryptionProperties exist in the schema
   137  		encryptedCols := encryptionProps.EncryptedColumns()
   138  		// if columnEncryptionProperties is empty, every column in the file schema will be encrypted with the footer key
   139  		if len(encryptedCols) != 0 {
   140  			colPaths := make(map[string]bool)
   141  			for i := 0; i < fw.Schema.NumColumns(); i++ {
   142  				colPaths[fw.Schema.Column(i).Path()] = true
   143  			}
   144  			for k := range encryptedCols {
   145  				if _, ok := colPaths[k]; !ok {
   146  					panic("encrypted column " + k + " not found in file schema")
   147  				}
   148  			}
   149  		}
   150  
   151  		fw.fileEncryptor = encryption.NewFileEncryptor(encryptionProps, fw.props.Allocator())
   152  		if encryptionProps.EncryptedFooter() {
   153  			magic = magicEBytes
   154  		}
   155  	}
   156  	n, err := fw.sink.Write(magic)
   157  	if n != 4 || err != nil {
   158  		panic("failed to write magic number")
   159  	}
   160  }
   161  
   162  // AppendKeyValueMetadata appends a key/value pair to the existing key/value metadata
   163  func (fw *Writer) AppendKeyValueMetadata(key string, value string) error {
   164  	return fw.metadata.AppendKeyValueMetadata(key, value)
   165  }
   166  
   167  // Close closes any open row group writer and writes the file footer. Subsequent
   168  // calls to close will have no effect.
   169  func (fw *Writer) Close() (err error) {
   170  	if fw.open {
   171  		// if any functions here panic, we set open to be false so
   172  		// that this doesn't get called again
   173  		fw.open = false
   174  		if fw.rowGroupWriter != nil {
   175  			fw.nrows += fw.rowGroupWriter.nrows
   176  			fw.rowGroupWriter.Close()
   177  		}
   178  		fw.rowGroupWriter = nil
   179  		defer func() {
   180  			ierr := fw.sink.Close()
   181  			if err != nil {
   182  				if ierr != nil {
   183  					err = fmt.Errorf("error on close:%w, %s", err, ierr)
   184  				}
   185  				return
   186  			}
   187  
   188  			err = ierr
   189  		}()
   190  
   191  		fileEncryptProps := fw.props.FileEncryptionProperties()
   192  		if fileEncryptProps == nil { // non encrypted file
   193  			fileMetadata, err := fw.metadata.Finish()
   194  			if err != nil {
   195  				return err
   196  			}
   197  
   198  			_, err = writeFileMetadata(fileMetadata, fw.sink)
   199  			return err
   200  		}
   201  
   202  		return fw.closeEncryptedFile(fileEncryptProps)
   203  	}
   204  	return nil
   205  }
   206  
   207  func (fw *Writer) closeEncryptedFile(props *parquet.FileEncryptionProperties) error {
   208  	// encrypted file with encrypted footer
   209  	if props.EncryptedFooter() {
   210  		fileMetadata, err := fw.metadata.Finish()
   211  		if err != nil {
   212  			return err
   213  		}
   214  
   215  		footerLen := int64(0)
   216  
   217  		cryptoMetadata := fw.metadata.GetFileCryptoMetaData()
   218  		n, err := writeFileCryptoMetadata(cryptoMetadata, fw.sink)
   219  		if err != nil {
   220  			return err
   221  		}
   222  
   223  		footerLen += n
   224  		footerEncryptor := fw.fileEncryptor.GetFooterEncryptor()
   225  		n, err = writeEncryptedFileMetadata(fileMetadata, fw.sink, footerEncryptor, true)
   226  		if err != nil {
   227  			return err
   228  		}
   229  		footerLen += n
   230  
   231  		if err = binary.Write(fw.sink, binary.LittleEndian, uint32(footerLen)); err != nil {
   232  			return err
   233  		}
   234  		if _, err = fw.sink.Write(magicEBytes); err != nil {
   235  			return err
   236  		}
   237  	} else {
   238  		fileMetadata, err := fw.metadata.Finish()
   239  		if err != nil {
   240  			return err
   241  		}
   242  		footerSigningEncryptor := fw.fileEncryptor.GetFooterSigningEncryptor()
   243  		if _, err = writeEncryptedFileMetadata(fileMetadata, fw.sink, footerSigningEncryptor, false); err != nil {
   244  			return err
   245  		}
   246  	}
   247  	if fw.fileEncryptor != nil {
   248  		fw.fileEncryptor.WipeOutEncryptionKeys()
   249  	}
   250  	return nil
   251  }
   252  
   253  func writeFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer) (n int64, err error) {
   254  	n, err = fileMetadata.WriteTo(w, nil)
   255  	if err != nil {
   256  		return
   257  	}
   258  
   259  	if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil {
   260  		return
   261  	}
   262  	if _, err = w.Write(magicBytes); err != nil {
   263  		return
   264  	}
   265  	return n + int64(4+len(magicBytes)), nil
   266  }
   267  
   268  func writeEncryptedFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer, encryptor encryption.Encryptor, encryptFooter bool) (n int64, err error) {
   269  	n, err = fileMetadata.WriteTo(w, encryptor)
   270  	if encryptFooter {
   271  		return
   272  	}
   273  	if err != nil {
   274  		return
   275  	}
   276  	if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil {
   277  		return
   278  	}
   279  	if _, err = w.Write(magicBytes); err != nil {
   280  		return
   281  	}
   282  	return n + int64(4+len(magicBytes)), nil
   283  }
   284  
   285  func writeFileCryptoMetadata(crypto *metadata.FileCryptoMetadata, w io.Writer) (int64, error) {
   286  	return crypto.WriteTo(w)
   287  }