github.com/apache/arrow/go/v16@v16.1.0/parquet/file/file_writer.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "encoding/binary" 21 "fmt" 22 "io" 23 24 "github.com/apache/arrow/go/v16/parquet" 25 "github.com/apache/arrow/go/v16/parquet/internal/encryption" 26 "github.com/apache/arrow/go/v16/parquet/internal/utils" 27 "github.com/apache/arrow/go/v16/parquet/metadata" 28 "github.com/apache/arrow/go/v16/parquet/schema" 29 ) 30 31 // Writer is the primary interface for writing a parquet file 32 type Writer struct { 33 sink utils.WriteCloserTell 34 open bool 35 footerFlushed bool 36 props *parquet.WriterProperties 37 rowGroups int 38 nrows int 39 metadata metadata.FileMetaDataBuilder 40 fileEncryptor encryption.FileEncryptor 41 rowGroupWriter *rowGroupWriter 42 43 // The Schema of this writer 44 Schema *schema.Schema 45 } 46 47 type writerConfig struct { 48 props *parquet.WriterProperties 49 keyValueMetadata metadata.KeyValueMetadata 50 } 51 52 type WriteOption func(*writerConfig) 53 54 func WithWriterProps(props *parquet.WriterProperties) WriteOption { 55 return func(c *writerConfig) { 56 c.props = props 57 } 58 } 59 60 func WithWriteMetadata(meta metadata.KeyValueMetadata) WriteOption { 61 return func(c *writerConfig) { 62 c.keyValueMetadata = meta 63 } 64 } 65 66 // NewParquetWriter returns a Writer that writes to the provided WriteSeeker with the given schema. 67 // 68 // If props is nil, then the default Writer Properties will be used. If the key value metadata is not nil, 69 // it will be added to the file. 70 func NewParquetWriter(w io.Writer, sc *schema.GroupNode, opts ...WriteOption) *Writer { 71 config := &writerConfig{} 72 for _, o := range opts { 73 o(config) 74 } 75 if config.props == nil { 76 config.props = parquet.NewWriterProperties() 77 } 78 79 fileSchema := schema.NewSchema(sc) 80 fw := &Writer{ 81 props: config.props, 82 sink: &utils.TellWrapper{Writer: w}, 83 open: true, 84 Schema: fileSchema, 85 } 86 87 fw.metadata = *metadata.NewFileMetadataBuilder(fw.Schema, fw.props, config.keyValueMetadata) 88 fw.startFile() 89 return fw 90 } 91 92 // NumColumns returns the number of columns to write as defined by the schema. 93 func (fw *Writer) NumColumns() int { return fw.Schema.NumColumns() } 94 95 // NumRowGroups returns the current number of row groups that will be written for this file. 96 func (fw *Writer) NumRowGroups() int { return fw.rowGroups } 97 98 // NumRows returns the current number of rows that have be written 99 func (fw *Writer) NumRows() int { return fw.nrows } 100 101 // Properties returns the writer properties that are in use for this file. 102 func (fw *Writer) Properties() *parquet.WriterProperties { return fw.props } 103 104 // AppendBufferedRowGroup appends a rowgroup to the file and returns a writer 105 // that buffers the row group in memory allowing writing multiple columns 106 // at once to the row group. Data is not flushed out until the row group 107 // is closed. 108 // 109 // When calling Close, all columns must have the same number of rows written. 110 func (fw *Writer) AppendBufferedRowGroup() BufferedRowGroupWriter { 111 return fw.appendRowGroup(true) 112 } 113 114 // AppendRowGroup appends a row group to the file and returns a writer 115 // that writes columns to the row group in serial via calling NextColumn. 116 // 117 // When calling NextColumn, the same number of rows need to have been written 118 // to each column before moving on. Otherwise the rowgroup writer will panic. 119 func (fw *Writer) AppendRowGroup() SerialRowGroupWriter { 120 return fw.appendRowGroup(false) 121 } 122 123 func (fw *Writer) appendRowGroup(buffered bool) *rowGroupWriter { 124 if fw.rowGroupWriter != nil { 125 fw.nrows += fw.rowGroupWriter.nrows 126 fw.rowGroupWriter.Close() 127 } 128 fw.rowGroups++ 129 fw.footerFlushed = false 130 rgMeta := fw.metadata.AppendRowGroup() 131 fw.rowGroupWriter = newRowGroupWriter(fw.sink, rgMeta, int16(fw.rowGroups)-1, fw.props, buffered, fw.fileEncryptor) 132 return fw.rowGroupWriter 133 } 134 135 func (fw *Writer) startFile() { 136 encryptionProps := fw.props.FileEncryptionProperties() 137 magic := magicBytes 138 if encryptionProps != nil { 139 // check that all columns in columnEncryptionProperties exist in the schema 140 encryptedCols := encryptionProps.EncryptedColumns() 141 // if columnEncryptionProperties is empty, every column in the file schema will be encrypted with the footer key 142 if len(encryptedCols) != 0 { 143 colPaths := make(map[string]bool) 144 for i := 0; i < fw.Schema.NumColumns(); i++ { 145 colPaths[fw.Schema.Column(i).Path()] = true 146 } 147 for k := range encryptedCols { 148 if _, ok := colPaths[k]; !ok { 149 panic("encrypted column " + k + " not found in file schema") 150 } 151 } 152 } 153 154 fw.fileEncryptor = encryption.NewFileEncryptor(encryptionProps, fw.props.Allocator()) 155 if encryptionProps.EncryptedFooter() { 156 magic = magicEBytes 157 } 158 } 159 n, err := fw.sink.Write(magic) 160 if n != 4 || err != nil { 161 panic("failed to write magic number") 162 } 163 } 164 165 // AppendKeyValueMetadata appends a key/value pair to the existing key/value metadata 166 func (fw *Writer) AppendKeyValueMetadata(key string, value string) error { 167 return fw.metadata.AppendKeyValueMetadata(key, value) 168 } 169 170 // Close closes any open row group writer and writes the file footer. Subsequent 171 // calls to close will have no effect. 172 func (fw *Writer) Close() (err error) { 173 if fw.open { 174 // if any functions here panic, we set open to be false so 175 // that this doesn't get called again 176 fw.open = false 177 178 defer func() { 179 fw.closeEncryptor() 180 ierr := fw.sink.Close() 181 if err != nil { 182 if ierr != nil { 183 err = fmt.Errorf("error on close:%w, %s", err, ierr) 184 } 185 return 186 } 187 188 err = ierr 189 }() 190 191 err = fw.FlushWithFooter() 192 fw.metadata.Clear() 193 } 194 return nil 195 } 196 197 // FlushWithFooter closes any open row group writer and writes the file footer, leaving 198 // the writer open for additional row groups. Additional footers written by later 199 // calls to FlushWithFooter or Close will be cumulative, so that only the last footer 200 // written need ever be read by a reader. 201 func (fw *Writer) FlushWithFooter() error { 202 if !fw.footerFlushed { 203 if fw.rowGroupWriter != nil { 204 fw.nrows += fw.rowGroupWriter.nrows 205 fw.rowGroupWriter.Close() 206 } 207 fw.rowGroupWriter = nil 208 209 fileMetadata, err := fw.metadata.Snapshot() 210 if err != nil { 211 return err 212 } 213 214 fileEncryptProps := fw.props.FileEncryptionProperties() 215 if fileEncryptProps == nil { // non encrypted file 216 if _, err = writeFileMetadata(fileMetadata, fw.sink); err != nil { 217 return err 218 } 219 } else { 220 if err := fw.flushEncryptedFile(fileMetadata, fileEncryptProps); err != nil { 221 return err 222 } 223 } 224 225 fw.footerFlushed = true 226 } 227 return nil 228 } 229 230 func (fw *Writer) flushEncryptedFile(fileMetadata *metadata.FileMetaData, props *parquet.FileEncryptionProperties) error { 231 // encrypted file with encrypted footer 232 if props.EncryptedFooter() { 233 footerLen := int64(0) 234 235 cryptoMetadata := fw.metadata.GetFileCryptoMetaData() 236 n, err := writeFileCryptoMetadata(cryptoMetadata, fw.sink) 237 if err != nil { 238 return err 239 } 240 241 footerLen += n 242 footerEncryptor := fw.fileEncryptor.GetFooterEncryptor() 243 n, err = writeEncryptedFileMetadata(fileMetadata, fw.sink, footerEncryptor, true) 244 if err != nil { 245 return err 246 } 247 footerLen += n 248 249 if err = binary.Write(fw.sink, binary.LittleEndian, uint32(footerLen)); err != nil { 250 return err 251 } 252 if _, err = fw.sink.Write(magicEBytes); err != nil { 253 return err 254 } 255 } else { 256 footerSigningEncryptor := fw.fileEncryptor.GetFooterSigningEncryptor() 257 if _, err := writeEncryptedFileMetadata(fileMetadata, fw.sink, footerSigningEncryptor, false); err != nil { 258 return err 259 } 260 } 261 return nil 262 } 263 264 func (fw *Writer) closeEncryptor() { 265 if fw.fileEncryptor != nil { 266 fw.fileEncryptor.WipeOutEncryptionKeys() 267 } 268 } 269 270 func writeFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer) (n int64, err error) { 271 n, err = fileMetadata.WriteTo(w, nil) 272 if err != nil { 273 return 274 } 275 276 if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil { 277 return 278 } 279 if _, err = w.Write(magicBytes); err != nil { 280 return 281 } 282 return n + int64(4+len(magicBytes)), nil 283 } 284 285 func writeEncryptedFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer, encryptor encryption.Encryptor, encryptFooter bool) (n int64, err error) { 286 n, err = fileMetadata.WriteTo(w, encryptor) 287 if encryptFooter { 288 return 289 } 290 if err != nil { 291 return 292 } 293 if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil { 294 return 295 } 296 if _, err = w.Write(magicBytes); err != nil { 297 return 298 } 299 return n + int64(4+len(magicBytes)), nil 300 } 301 302 func writeFileCryptoMetadata(crypto *metadata.FileCryptoMetadata, w io.Writer) (int64, error) { 303 return crypto.WriteTo(w) 304 }