github.com/apache/arrow/go/v7@v7.0.1/parquet/file/file_writer.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "encoding/binary" 21 "io" 22 23 "github.com/apache/arrow/go/v7/parquet" 24 "github.com/apache/arrow/go/v7/parquet/internal/encryption" 25 "github.com/apache/arrow/go/v7/parquet/internal/utils" 26 "github.com/apache/arrow/go/v7/parquet/metadata" 27 "github.com/apache/arrow/go/v7/parquet/schema" 28 ) 29 30 // Writer is the primary interface for writing a parquet file 31 type Writer struct { 32 sink utils.WriteCloserTell 33 open bool 34 props *parquet.WriterProperties 35 rowGroups int 36 nrows int 37 metadata metadata.FileMetaDataBuilder 38 fileEncryptor encryption.FileEncryptor 39 rowGroupWriter *rowGroupWriter 40 41 // The Schema of this writer 42 Schema *schema.Schema 43 // The current FileMetadata to write 44 FileMetadata *metadata.FileMetaData 45 // The current keyvalue metadata 46 KeyValueMetadata metadata.KeyValueMetadata 47 } 48 49 type WriteOption func(*Writer) 50 51 func WithWriterProps(props *parquet.WriterProperties) WriteOption { 52 return func(w *Writer) { 53 w.props = props 54 } 55 } 56 57 func WithWriteMetadata(meta metadata.KeyValueMetadata) WriteOption { 58 return func(w *Writer) { 59 w.KeyValueMetadata = meta 60 } 61 } 62 63 // NewParquetWriter returns a Writer that writes to the provided WriteSeeker with the given schema. 64 // 65 // If props is nil, then the default Writer Properties will be used. If the key value metadata is not nil, 66 // it will be added to the file. 67 func NewParquetWriter(w io.Writer, sc *schema.GroupNode, opts ...WriteOption) *Writer { 68 fileSchema := schema.NewSchema(sc) 69 fw := &Writer{ 70 sink: &utils.TellWrapper{Writer: w}, 71 open: true, 72 Schema: fileSchema, 73 } 74 for _, o := range opts { 75 o(fw) 76 } 77 if fw.props == nil { 78 fw.props = parquet.NewWriterProperties() 79 } 80 fw.metadata = *metadata.NewFileMetadataBuilder(fw.Schema, fw.props, fw.KeyValueMetadata) 81 fw.startFile() 82 return fw 83 } 84 85 // NumColumns returns the number of columns to write as defined by the schema. 86 func (fw *Writer) NumColumns() int { return fw.Schema.NumColumns() } 87 88 // NumRowGroups returns the current number of row groups that will be written for this file. 89 func (fw *Writer) NumRowGroups() int { return fw.rowGroups } 90 91 // NumRows returns the current number of rows that have be written 92 func (fw *Writer) NumRows() int { return fw.nrows } 93 94 // Properties returns the writer properties that are in use for this file. 95 func (fw *Writer) Properties() *parquet.WriterProperties { return fw.props } 96 97 // AppendBufferedRowGroup appends a rowgroup to the file and returns a writer 98 // that buffers the row group in memory allowing writing multiple columns 99 // at once to the row group. Data is not flushed out until the row group 100 // is closed. 101 // 102 // When calling Close, all columns must have the same number of rows written. 103 func (fw *Writer) AppendBufferedRowGroup() BufferedRowGroupWriter { 104 return fw.appendRowGroup(true) 105 } 106 107 // AppendRowGroup appends a row group to the file and returns a writer 108 // that writes columns to the row group in serial via calling NextColumn. 109 // 110 // When calling NextColumn, the same number of rows need to have been written 111 // to each column before moving on. Otherwise the rowgroup writer will panic. 112 func (fw *Writer) AppendRowGroup() SerialRowGroupWriter { 113 return fw.appendRowGroup(false) 114 } 115 116 func (fw *Writer) appendRowGroup(buffered bool) *rowGroupWriter { 117 if fw.rowGroupWriter != nil { 118 fw.rowGroupWriter.Close() 119 } 120 fw.rowGroups++ 121 rgMeta := fw.metadata.AppendRowGroup() 122 fw.rowGroupWriter = newRowGroupWriter(fw.sink, rgMeta, int16(fw.rowGroups)-1, fw.props, buffered, fw.fileEncryptor) 123 return fw.rowGroupWriter 124 } 125 126 func (fw *Writer) startFile() { 127 encryptionProps := fw.props.FileEncryptionProperties() 128 magic := magicBytes 129 if encryptionProps != nil { 130 // check that all columns in columnEncryptionProperties exist in the schema 131 encryptedCols := encryptionProps.EncryptedColumns() 132 // if columnEncryptionProperties is empty, every column in the file schema will be encrypted with the footer key 133 if len(encryptedCols) != 0 { 134 colPaths := make(map[string]bool) 135 for i := 0; i < fw.Schema.NumColumns(); i++ { 136 colPaths[fw.Schema.Column(i).Path()] = true 137 } 138 for k := range encryptedCols { 139 if _, ok := colPaths[k]; !ok { 140 panic("encrypted column " + k + " not found in file schema") 141 } 142 } 143 } 144 145 fw.fileEncryptor = encryption.NewFileEncryptor(encryptionProps, fw.props.Allocator()) 146 if encryptionProps.EncryptedFooter() { 147 magic = magicEBytes 148 } 149 } 150 n, err := fw.sink.Write(magic) 151 if n != 4 || err != nil { 152 panic("failed to write magic number") 153 } 154 } 155 156 // Close closes any open row group writer and writes the file footer. Subsequent 157 // calls to close will have no effect. 158 func (fw *Writer) Close() error { 159 if fw.open { 160 // if any functions here panic, we set open to be false so 161 // that this doesn't get called again 162 fw.open = false 163 if fw.rowGroupWriter != nil { 164 fw.nrows += fw.rowGroupWriter.nrows 165 fw.rowGroupWriter.Close() 166 } 167 fw.rowGroupWriter = nil 168 defer fw.sink.Close() 169 170 fileEncryptProps := fw.props.FileEncryptionProperties() 171 if fileEncryptProps == nil { // non encrypted file 172 var err error 173 if fw.FileMetadata, err = fw.metadata.Finish(); err != nil { 174 return err 175 } 176 177 _, err = writeFileMetadata(fw.FileMetadata, fw.sink) 178 return err 179 } 180 181 return fw.closeEncryptedFile(fileEncryptProps) 182 } 183 return nil 184 } 185 186 func (fw *Writer) closeEncryptedFile(props *parquet.FileEncryptionProperties) (err error) { 187 // encrypted file with encrypted footer 188 if props.EncryptedFooter() { 189 fw.FileMetadata, err = fw.metadata.Finish() 190 if err != nil { 191 return 192 } 193 194 footerLen := int64(0) 195 196 cryptoMetadata := fw.metadata.GetFileCryptoMetaData() 197 n, err := writeFileCryptoMetadata(cryptoMetadata, fw.sink) 198 if err != nil { 199 return err 200 } 201 202 footerLen += n 203 footerEncryptor := fw.fileEncryptor.GetFooterEncryptor() 204 n, err = writeEncryptedFileMetadata(fw.FileMetadata, fw.sink, footerEncryptor, true) 205 if err != nil { 206 return err 207 } 208 footerLen += n 209 210 if err = binary.Write(fw.sink, binary.LittleEndian, uint32(footerLen)); err != nil { 211 return err 212 } 213 if _, err = fw.sink.Write(magicEBytes); err != nil { 214 return err 215 } 216 } else { 217 if fw.FileMetadata, err = fw.metadata.Finish(); err != nil { 218 return 219 } 220 footerSigningEncryptor := fw.fileEncryptor.GetFooterSigningEncryptor() 221 if _, err = writeEncryptedFileMetadata(fw.FileMetadata, fw.sink, footerSigningEncryptor, false); err != nil { 222 return err 223 } 224 } 225 if fw.fileEncryptor != nil { 226 fw.fileEncryptor.WipeOutEncryptionKeys() 227 } 228 return nil 229 } 230 231 func writeFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer) (n int64, err error) { 232 n, err = fileMetadata.WriteTo(w, nil) 233 if err != nil { 234 return 235 } 236 237 if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil { 238 return 239 } 240 if _, err = w.Write(magicBytes); err != nil { 241 return 242 } 243 return n + int64(4+len(magicBytes)), nil 244 } 245 246 func writeEncryptedFileMetadata(fileMetadata *metadata.FileMetaData, w io.Writer, encryptor encryption.Encryptor, encryptFooter bool) (n int64, err error) { 247 n, err = fileMetadata.WriteTo(w, encryptor) 248 if encryptFooter { 249 return 250 } 251 if err != nil { 252 return 253 } 254 if err = binary.Write(w, binary.LittleEndian, uint32(n)); err != nil { 255 return 256 } 257 if _, err = w.Write(magicBytes); err != nil { 258 return 259 } 260 return n + int64(4+len(magicBytes)), nil 261 } 262 263 func writeFileCryptoMetadata(crypto *metadata.FileCryptoMetadata, w io.Writer) (int64, error) { 264 return crypto.WriteTo(w) 265 }