github.com/apache/arrow/go/v14@v14.0.1/parquet/file/file_writer.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "encoding/binary" 21 "fmt" 22 "io" 23 24 "github.com/apache/arrow/go/v14/parquet" 25 "github.com/apache/arrow/go/v14/parquet/internal/encryption" 26 "github.com/apache/arrow/go/v14/parquet/internal/utils" 27 "github.com/apache/arrow/go/v14/parquet/metadata" 28 "github.com/apache/arrow/go/v14/parquet/schema" 29 ) 30 31 // Writer is the primary interface for writing a parquet file 32 type Writer struct { 33 sink utils.WriteCloserTell 34 open bool 35 props *parquet.WriterProperties 36 rowGroups int 37 nrows int 38 metadata metadata.FileMetaDataBuilder 39 fileEncryptor encryption.FileEncryptor 40 rowGroupWriter *rowGroupWriter 41 42 // The Schema of this writer 43 Schema *schema.Schema 44 } 45 46 type writerConfig struct { 47 props *parquet.WriterProperties 48 keyValueMetadata metadata.KeyValueMetadata 49 } 50 51 type WriteOption func(*writerConfig) 52 53 func WithWriterProps(props *parquet.WriterProperties) WriteOption { 54 return func(c *writerConfig) { 55 c.props = props 56 } 57 } 58 59 func WithWriteMetadata(meta metadata.KeyValueMetadata) WriteOption { 60 return func(c *writerConfig) { 61 c.keyValueMetadata = meta 62 } 63 } 64 65 // NewParquetWriter returns a Writer that writes to the provided WriteSeeker with the given schema. 66 // 67 // If props is nil, then the default Writer Properties will be used. If the key value metadata is not nil, 68 // it will be added to the file. 69 func NewParquetWriter(w io.Writer, sc *schema.GroupNode, opts ...WriteOption) *Writer { 70 config := &writerConfig{} 71 for _, o := range opts { 72 o(config) 73 } 74 if config.props == nil { 75 config.props = parquet.NewWriterProperties() 76 } 77 78 fileSchema := schema.NewSchema(sc) 79 fw := &Writer{ 80 props: config.props, 81 sink: &utils.TellWrapper{Writer: w}, 82 open: true, 83 Schema: fileSchema, 84 } 85 86 fw.metadata = *metadata.NewFileMetadataBuilder(fw.Schema, fw.props, config.keyValueMetadata) 87 fw.startFile() 88 return fw 89 } 90 91 // NumColumns returns the number of columns to write as defined by the schema. 92 func (fw *Writer) NumColumns() int { return fw.Schema.NumColumns() } 93 94 // NumRowGroups returns the current number of row groups that will be written for this file. 95 func (fw *Writer) NumRowGroups() int { return fw.rowGroups } 96 97 // NumRows returns the current number of rows that have be written 98 func (fw *Writer) NumRows() int { return fw.nrows } 99 100 // Properties returns the writer properties that are in use for this file. 101 func (fw *Writer) Properties() *parquet.WriterProperties { return fw.props } 102 103 // AppendBufferedRowGroup appends a rowgroup to the file and returns a writer 104 // that buffers the row group in memory allowing writing multiple columns 105 // at once to the row group. Data is not flushed out until the row group 106 // is closed. 107 // 108 // When calling Close, all columns must have the same number of rows written. 109 func (fw *Writer) AppendBufferedRowGroup() BufferedRowGroupWriter { 110 return fw.appendRowGroup(true) 111 } 112 113 // AppendRowGroup appends a row group to the file and returns a writer 114 // that writes columns to the row group in serial via calling NextColumn. 115 // 116 // When calling NextColumn, the same number of rows need to have been written 117 // to each column before moving on. Otherwise the rowgroup writer will panic. 118 func (fw *Writer) AppendRowGroup() SerialRowGroupWriter { 119 return fw.appendRowGroup(false) 120 } 121 122 func (fw *Writer) appendRowGroup(buffered bool) *rowGroupWriter { 123 if fw.rowGroupWriter != nil { 124 fw.rowGroupWriter.Close() 125 } 126 fw.rowGroups++ 127 rgMeta := fw.metadata.AppendRowGroup() 128 fw.rowGroupWriter = newRowGroupWriter(fw.sink, rgMeta, int16(fw.rowGroups)-1, fw.props, buffered, fw.fileEncryptor) 129 return fw.rowGroupWriter 130 } 131 132 func (fw *Writer) startFile() { 133 encryptionProps := fw.props.FileEncryptionProperties() 134 magic := magicBytes 135 if encryptionProps != nil { 136 // check that all columns in columnEncryptionProperties exist in the schema 137 encryptedCols := encryptionProps.EncryptedColumns() 138 // if columnEncryptionProperties is empty, every column in the file schema will be encrypted with the footer key 139 if len(encryptedCols) != 0 { 140 colPaths := make(map[string]bool) 141 for i := 0; i < fw.Schema.NumColumns(); i++ { 142 colPaths[fw.Schema.Column(i).Path()] = true 143 } 144 for k := range encryptedCols { 145 if _, ok := colPaths[k]; !ok { 146 panic("encrypted column " + k + " not found in file schema") 147 } 148 } 149 } 150 151 fw.fileEncryptor = encryption.NewFileEncryptor(encryptionProps, fw.props.Allocator()) 152 if encryptionProps.EncryptedFooter() { 153 magic = magicEBytes 154 } 155 } 156 n, err := fw.sink.Write(magic) 157 if n != 4 || err != nil { 158 panic("failed to write magic number") 159 } 160 } 161 162 // AppendKeyValueMetadata appends a key/value pair to the existing key/value metadata 163 func (fw *Writer) AppendKeyValueMetadata(key string, value string) error { 164 return fw.metadata.AppendKeyValueMetadata(key, value) 165 } 166 167 // Close closes any open row group writer and writes the file footer. Subsequent 168 // calls to close will have no effect. 169 func (fw *Writer) Close() (err error) { 170 if fw.open { 171 // if any functions here panic, we set open to be false so 172 // that this doesn't get called again 173 fw.open = false 174 if fw.rowGroupWriter != nil { 175 fw.nrows += fw.rowGroupWriter.nrows 176 fw.rowGroupWriter.Close() 177 } 178 fw.rowGroupWriter = nil 179 defer func() { 180 ierr := fw.sink.Close() 181 if err != nil { 182 if ierr != nil { 183 err = fmt.Errorf("error on close:%w, %s", err, ierr) 184 } 185 return 186 } 187 188 err = ierr 189 }() 190 191 fileEncryptProps := fw.props.FileEncryptionProperties() 192 if fileEncryptProps == nil { // non encrypted file 193 fileMetadata, err := fw.metadata.Finish() 194 if err != nil { 195 return err 196 } 197 198 _, err = writeFileMetadata(fileMetadata, fw.sink) 199 return err 200 } 201 202 return fw.closeEncryptedFile(fileEncryptProps) 203 } 204 return nil 205 } 206 207 func (fw *Writer) closeEncryptedFile(props *parquet.FileEncryptionProperties) error { 208 // encrypted file with encrypted footer 209 if props.EncryptedFooter() { 210 fileMetadata, err := fw.metadata.Finish() 211 if err != nil { 212 return err 213 } 214 215 footerLen := int64(0) 216 217 cryptoMetadata := fw.metadata.GetFileCryptoMetaData() 218 n, err := writeFileCryptoMetadata(cryptoMetadata, fw.sink) 219 if err != nil { 220 return err 221 } 222 223 footerLen += n 224 footerEncryptor := fw.fileEncryptor.GetFooterEncryptor() 225 n, err = writeEncryptedFileMetadata(fileMetadata, fw.sink, footerEncryptor, true) 226 if err != nil { 227 return err 228 } 229 footerLen += n 230 231 if err = binary.Write(fw.sink, binary.LittleEndian, uint32(footerLen)); err != nil { 232 return err 233 } 234 if _, err = fw.sink.Write(magicEBytes); err != nil { 235 return err 236 } 237 } else { 238 fileMetadata, err := fw.metadata.Finish() 239 if err != nil { 240 return err 241 } 242 footerSigningEncryptor := fw.fileEncryptor.GetFooterSigningEncryptor() 243 if _, err = writeEncryptedFileMetadata(fileMetadata, fw.sink, footerSigningEncryptor, false); err != nil { 244 return err 245 } 246 } 247 if fw.fileEncryptor != nil { 248 fw.fileEncryptor.WipeOutEncryptionKeys() 249 } 250 return nil 251 } 252 253 func writeFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer) (n int64, err error) { 254 n, err = fileMetadata.WriteTo(w, nil) 255 if err != nil { 256 return 257 } 258 259 if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil { 260 return 261 } 262 if _, err = w.Write(magicBytes); err != nil { 263 return 264 } 265 return n + int64(4+len(magicBytes)), nil 266 } 267 268 func writeEncryptedFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer, encryptor encryption.Encryptor, encryptFooter bool) (n int64, err error) { 269 n, err = fileMetadata.WriteTo(w, encryptor) 270 if encryptFooter { 271 return 272 } 273 if err != nil { 274 return 275 } 276 if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil { 277 return 278 } 279 if _, err = w.Write(magicBytes); err != nil { 280 return 281 } 282 return n + int64(4+len(magicBytes)), nil 283 } 284 285 func writeFileCryptoMetadata(crypto *metadata.FileCryptoMetadata, w io.Writer) (int64, error) { 286 return crypto.WriteTo(w) 287 }