github.com/apache/arrow/go/v7@v7.0.1/parquet/file/file_writer.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"encoding/binary"
    21  	"io"
    22  
    23  	"github.com/apache/arrow/go/v7/parquet"
    24  	"github.com/apache/arrow/go/v7/parquet/internal/encryption"
    25  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    26  	"github.com/apache/arrow/go/v7/parquet/metadata"
    27  	"github.com/apache/arrow/go/v7/parquet/schema"
    28  )
    29  
// Writer is the primary interface for writing a parquet file.
//
// A Writer is created with NewParquetWriter, row groups are added with
// AppendRowGroup or AppendBufferedRowGroup, and Close writes the file footer.
type Writer struct {
	// sink is the destination every byte of the file is written to.
	sink utils.WriteCloserTell
	// open is set by NewParquetWriter and cleared by Close.
	open bool
	// props holds the writer properties in use for this file.
	props *parquet.WriterProperties
	// rowGroups counts the row groups appended so far.
	rowGroups int
	// nrows accumulates row counts taken from row group writers.
	nrows int
	// metadata builds the file level metadata that Close finishes and writes.
	metadata metadata.FileMetaDataBuilder
	// fileEncryptor is created by startFile when file encryption properties
	// are configured; it is left unset otherwise.
	fileEncryptor encryption.FileEncryptor
	// rowGroupWriter is the most recently appended row group writer, if any.
	rowGroupWriter *rowGroupWriter

	// The Schema of this writer
	Schema *schema.Schema
	// The current FileMetadata to write; assigned by Close once the metadata
	// builder has been finished.
	FileMetadata *metadata.FileMetaData
	// The current keyvalue metadata
	KeyValueMetadata metadata.KeyValueMetadata
}
    48  
// WriteOption is a functional option that NewParquetWriter applies to the
// Writer it is constructing.
type WriteOption func(*Writer)
    50  
    51  func WithWriterProps(props *parquet.WriterProperties) WriteOption {
    52  	return func(w *Writer) {
    53  		w.props = props
    54  	}
    55  }
    56  
    57  func WithWriteMetadata(meta metadata.KeyValueMetadata) WriteOption {
    58  	return func(w *Writer) {
    59  		w.KeyValueMetadata = meta
    60  	}
    61  }
    62  
    63  // NewParquetWriter returns a Writer that writes to the provided WriteSeeker with the given schema.
    64  //
    65  // If props is nil, then the default Writer Properties will be used. If the key value metadata is not nil,
    66  // it will be added to the file.
    67  func NewParquetWriter(w io.Writer, sc *schema.GroupNode, opts ...WriteOption) *Writer {
    68  	fileSchema := schema.NewSchema(sc)
    69  	fw := &Writer{
    70  		sink:   &utils.TellWrapper{Writer: w},
    71  		open:   true,
    72  		Schema: fileSchema,
    73  	}
    74  	for _, o := range opts {
    75  		o(fw)
    76  	}
    77  	if fw.props == nil {
    78  		fw.props = parquet.NewWriterProperties()
    79  	}
    80  	fw.metadata = *metadata.NewFileMetadataBuilder(fw.Schema, fw.props, fw.KeyValueMetadata)
    81  	fw.startFile()
    82  	return fw
    83  }
    84  
    85  // NumColumns returns the number of columns to write as defined by the schema.
    86  func (fw *Writer) NumColumns() int { return fw.Schema.NumColumns() }
    87  
    88  // NumRowGroups returns the current number of row groups that will be written for this file.
    89  func (fw *Writer) NumRowGroups() int { return fw.rowGroups }
    90  
    91  // NumRows returns the current number of rows that have be written
    92  func (fw *Writer) NumRows() int { return fw.nrows }
    93  
    94  // Properties returns the writer properties that are in use for this file.
    95  func (fw *Writer) Properties() *parquet.WriterProperties { return fw.props }
    96  
    97  // AppendBufferedRowGroup appends a rowgroup to the file and returns a writer
    98  // that buffers the row group in memory allowing writing multiple columns
    99  // at once to the row group. Data is not flushed out until the row group
   100  // is closed.
   101  //
   102  // When calling Close, all columns must have the same number of rows written.
   103  func (fw *Writer) AppendBufferedRowGroup() BufferedRowGroupWriter {
   104  	return fw.appendRowGroup(true)
   105  }
   106  
   107  // AppendRowGroup appends a row group to the file and returns a writer
   108  // that writes columns to the row group in serial via calling NextColumn.
   109  //
   110  // When calling NextColumn, the same number of rows need to have been written
   111  // to each column before moving on. Otherwise the rowgroup writer will panic.
   112  func (fw *Writer) AppendRowGroup() SerialRowGroupWriter {
   113  	return fw.appendRowGroup(false)
   114  }
   115  
   116  func (fw *Writer) appendRowGroup(buffered bool) *rowGroupWriter {
   117  	if fw.rowGroupWriter != nil {
   118  		fw.rowGroupWriter.Close()
   119  	}
   120  	fw.rowGroups++
   121  	rgMeta := fw.metadata.AppendRowGroup()
   122  	fw.rowGroupWriter = newRowGroupWriter(fw.sink, rgMeta, int16(fw.rowGroups)-1, fw.props, buffered, fw.fileEncryptor)
   123  	return fw.rowGroupWriter
   124  }
   125  
   126  func (fw *Writer) startFile() {
   127  	encryptionProps := fw.props.FileEncryptionProperties()
   128  	magic := magicBytes
   129  	if encryptionProps != nil {
   130  		// check that all columns in columnEncryptionProperties exist in the schema
   131  		encryptedCols := encryptionProps.EncryptedColumns()
   132  		// if columnEncryptionProperties is empty, every column in the file schema will be encrypted with the footer key
   133  		if len(encryptedCols) != 0 {
   134  			colPaths := make(map[string]bool)
   135  			for i := 0; i < fw.Schema.NumColumns(); i++ {
   136  				colPaths[fw.Schema.Column(i).Path()] = true
   137  			}
   138  			for k := range encryptedCols {
   139  				if _, ok := colPaths[k]; !ok {
   140  					panic("encrypted column " + k + " not found in file schema")
   141  				}
   142  			}
   143  		}
   144  
   145  		fw.fileEncryptor = encryption.NewFileEncryptor(encryptionProps, fw.props.Allocator())
   146  		if encryptionProps.EncryptedFooter() {
   147  			magic = magicEBytes
   148  		}
   149  	}
   150  	n, err := fw.sink.Write(magic)
   151  	if n != 4 || err != nil {
   152  		panic("failed to write magic number")
   153  	}
   154  }
   155  
   156  // Close closes any open row group writer and writes the file footer. Subsequent
   157  // calls to close will have no effect.
   158  func (fw *Writer) Close() error {
   159  	if fw.open {
   160  		// if any functions here panic, we set open to be false so
   161  		// that this doesn't get called again
   162  		fw.open = false
   163  		if fw.rowGroupWriter != nil {
   164  			fw.nrows += fw.rowGroupWriter.nrows
   165  			fw.rowGroupWriter.Close()
   166  		}
   167  		fw.rowGroupWriter = nil
   168  		defer fw.sink.Close()
   169  
   170  		fileEncryptProps := fw.props.FileEncryptionProperties()
   171  		if fileEncryptProps == nil { // non encrypted file
   172  			var err error
   173  			if fw.FileMetadata, err = fw.metadata.Finish(); err != nil {
   174  				return err
   175  			}
   176  
   177  			_, err = writeFileMetadata(fw.FileMetadata, fw.sink)
   178  			return err
   179  		}
   180  
   181  		return fw.closeEncryptedFile(fileEncryptProps)
   182  	}
   183  	return nil
   184  }
   185  
   186  func (fw *Writer) closeEncryptedFile(props *parquet.FileEncryptionProperties) (err error) {
   187  	// encrypted file with encrypted footer
   188  	if props.EncryptedFooter() {
   189  		fw.FileMetadata, err = fw.metadata.Finish()
   190  		if err != nil {
   191  			return
   192  		}
   193  
   194  		footerLen := int64(0)
   195  
   196  		cryptoMetadata := fw.metadata.GetFileCryptoMetaData()
   197  		n, err := writeFileCryptoMetadata(cryptoMetadata, fw.sink)
   198  		if err != nil {
   199  			return err
   200  		}
   201  
   202  		footerLen += n
   203  		footerEncryptor := fw.fileEncryptor.GetFooterEncryptor()
   204  		n, err = writeEncryptedFileMetadata(fw.FileMetadata, fw.sink, footerEncryptor, true)
   205  		if err != nil {
   206  			return err
   207  		}
   208  		footerLen += n
   209  
   210  		if err = binary.Write(fw.sink, binary.LittleEndian, uint32(footerLen)); err != nil {
   211  			return err
   212  		}
   213  		if _, err = fw.sink.Write(magicEBytes); err != nil {
   214  			return err
   215  		}
   216  	} else {
   217  		if fw.FileMetadata, err = fw.metadata.Finish(); err != nil {
   218  			return
   219  		}
   220  		footerSigningEncryptor := fw.fileEncryptor.GetFooterSigningEncryptor()
   221  		if _, err = writeEncryptedFileMetadata(fw.FileMetadata, fw.sink, footerSigningEncryptor, false); err != nil {
   222  			return err
   223  		}
   224  	}
   225  	if fw.fileEncryptor != nil {
   226  		fw.fileEncryptor.WipeOutEncryptionKeys()
   227  	}
   228  	return nil
   229  }
   230  
   231  func writeFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer) (n int64, err error) {
   232  	n, err = fileMetadata.WriteTo(w, nil)
   233  	if err != nil {
   234  		return
   235  	}
   236  
   237  	if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil {
   238  		return
   239  	}
   240  	if _, err = w.Write(magicBytes); err != nil {
   241  		return
   242  	}
   243  	return n + int64(4+len(magicBytes)), nil
   244  }
   245  
   246  func writeEncryptedFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer, encryptor encryption.Encryptor, encryptFooter bool) (n int64, err error) {
   247  	n, err = fileMetadata.WriteTo(w, encryptor)
   248  	if encryptFooter {
   249  		return
   250  	}
   251  	if err != nil {
   252  		return
   253  	}
   254  	if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil {
   255  		return
   256  	}
   257  	if _, err = w.Write(magicBytes); err != nil {
   258  		return
   259  	}
   260  	return n + int64(4+len(magicBytes)), nil
   261  }
   262  
   263  func writeFileCryptoMetadata(crypto *metadata.FileCryptoMetadata, w io.Writer) (int64, error) {
   264  	return crypto.WriteTo(w)
   265  }