github.com/apache/arrow/go/v14@v14.0.1/parquet/pqarrow/file_writer.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow 18 19 import ( 20 "context" 21 "encoding/base64" 22 "fmt" 23 "io" 24 25 "github.com/apache/arrow/go/v14/arrow" 26 "github.com/apache/arrow/go/v14/arrow/flight" 27 "github.com/apache/arrow/go/v14/internal/utils" 28 "github.com/apache/arrow/go/v14/parquet" 29 "github.com/apache/arrow/go/v14/parquet/file" 30 "github.com/apache/arrow/go/v14/parquet/metadata" 31 "golang.org/x/xerrors" 32 ) 33 34 // WriteTable is a convenience function to create and write a full array.Table to a parquet file. The schema 35 // and columns will be determined by the schema of the table, writing the file out to the provided writer. 36 // The chunksize will be utilized in order to determine the size of the row groups. 37 func WriteTable(tbl arrow.Table, w io.Writer, chunkSize int64, props *parquet.WriterProperties, arrprops ArrowWriterProperties) error { 38 writer, err := NewFileWriter(tbl.Schema(), w, props, arrprops) 39 if err != nil { 40 return err 41 } 42 43 if err := writer.WriteTable(tbl, chunkSize); err != nil { 44 return err 45 } 46 47 return writer.Close() 48 } 49 50 // FileWriter is an object for writing Arrow directly to a parquet file. 51 type FileWriter struct { 52 wr *file.Writer 53 schema *arrow.Schema 54 manifest *SchemaManifest 55 rgw file.RowGroupWriter 56 arrowProps ArrowWriterProperties 57 ctx context.Context 58 colIdx int 59 closed bool 60 } 61 62 // NewFileWriter returns a writer for writing Arrow directly to a parquetfile, rather than 63 // the ArrowColumnWriter and WriteArrow functions which allow writing arrow to an existing 64 // file.Writer, this will create a new file.Writer based on the schema provided. 65 func NewFileWriter(arrschema *arrow.Schema, w io.Writer, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*FileWriter, error) { 66 if props == nil { 67 props = parquet.NewWriterProperties() 68 } 69 70 pqschema, err := ToParquet(arrschema, props, arrprops) 71 if err != nil { 72 return nil, err 73 } 74 75 meta := make(metadata.KeyValueMetadata, 0) 76 for i := 0; i < arrschema.Metadata().Len(); i++ { 77 meta.Append(arrschema.Metadata().Keys()[i], arrschema.Metadata().Values()[i]) 78 } 79 80 if arrprops.storeSchema { 81 serializedSchema := flight.SerializeSchema(arrschema, props.Allocator()) 82 meta.Append("ARROW:schema", base64.StdEncoding.EncodeToString(serializedSchema)) 83 } 84 85 schemaNode := pqschema.Root() 86 baseWriter := file.NewParquetWriter(w, schemaNode, file.WithWriterProps(props), file.WithWriteMetadata(meta)) 87 88 manifest, err := NewSchemaManifest(pqschema, nil, &ArrowReadProperties{}) 89 if err != nil { 90 return nil, err 91 } 92 93 return &FileWriter{wr: baseWriter, schema: arrschema, manifest: manifest, arrowProps: arrprops, ctx: NewArrowWriteContext(context.TODO(), &arrprops)}, nil 94 } 95 96 // NewRowGroup does what it says on the tin, creates a new row group in the underlying file. 97 // Equivalent to `AppendRowGroup` on a file.Writer 98 func (fw *FileWriter) NewRowGroup() { 99 if fw.rgw != nil { 100 fw.rgw.Close() 101 } 102 fw.rgw = fw.wr.AppendRowGroup() 103 fw.colIdx = 0 104 } 105 106 // NewBufferedRowGroup starts a new memory Buffered Row Group to allow writing columns / records 107 // without immediately flushing them to disk. This allows using WriteBuffered to write records 108 // and decide where to break your row group based on the TotalBytesWritten rather than on the max 109 // row group len. If using Records, this should be paired with WriteBuffered, while 110 // Write will always write a new record as a row group in and of itself. 111 func (fw *FileWriter) NewBufferedRowGroup() { 112 if fw.rgw != nil { 113 fw.rgw.Close() 114 } 115 fw.rgw = fw.wr.AppendBufferedRowGroup() 116 fw.colIdx = 0 117 } 118 119 // RowGroupTotalCompressedBytes returns the total number of bytes after compression 120 // that have been written to the current row group so far. 121 func (fw *FileWriter) RowGroupTotalCompressedBytes() int64 { 122 if fw.rgw != nil { 123 return fw.rgw.TotalCompressedBytes() 124 } 125 return 0 126 } 127 128 // RowGroupTotalBytesWritten returns the total number of bytes written and flushed out in 129 // the current row group. 130 func (fw *FileWriter) RowGroupTotalBytesWritten() int64 { 131 if fw.rgw != nil { 132 return fw.rgw.TotalBytesWritten() 133 } 134 return 0 135 } 136 137 // WriteBuffered will either append to an existing row group or create a new one 138 // based on the record length and max row group length. 139 // 140 // Additionally, it allows to manually break your row group by 141 // checking RowGroupTotalBytesWritten and calling NewBufferedRowGroup, 142 // while Write will always create at least 1 row group for the record. 143 // 144 // Performance-wise WriteBuffered might be more favorable than Write if you're dealing with: 145 // * a loose memory environment (meaning you have a lot of memory to utilize) 146 // * records that have only a small (~<1K?) amount of rows 147 // 148 // More memory is utilized compared to Write as the whole row group data is kept in memory before it's written 149 // since Parquet files must have an entire column written before writing the next column. 150 func (fw *FileWriter) WriteBuffered(rec arrow.Record) error { 151 if !rec.Schema().Equal(fw.schema) { 152 return fmt.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema) 153 } 154 155 var ( 156 recList []arrow.Record 157 maxRows = fw.wr.Properties().MaxRowGroupLength() 158 curRows int 159 err error 160 ) 161 if fw.rgw != nil { 162 if curRows, err = fw.rgw.NumRows(); err != nil { 163 return err 164 } 165 } else { 166 fw.NewBufferedRowGroup() 167 } 168 169 if int64(curRows)+rec.NumRows() <= maxRows { 170 recList = []arrow.Record{rec} 171 } else { 172 recList = []arrow.Record{rec.NewSlice(0, maxRows-int64(curRows))} 173 defer recList[0].Release() 174 for offset := maxRows - int64(curRows); offset < rec.NumRows(); offset += maxRows { 175 s := rec.NewSlice(offset, offset+utils.Min(maxRows, rec.NumRows()-offset)) 176 defer s.Release() 177 recList = append(recList, s) 178 } 179 } 180 181 for idx, r := range recList { 182 if idx > 0 { 183 fw.NewBufferedRowGroup() 184 } 185 for i := 0; i < int(r.NumCols()); i++ { 186 if err := fw.WriteColumnData(r.Column(i)); err != nil { 187 fw.Close() 188 return err 189 } 190 } 191 } 192 fw.colIdx = 0 193 return nil 194 } 195 196 // Write an arrow Record Batch to the file, respecting the MaxRowGroupLength in the writer 197 // properties to determine whether the record is broken up into more than one row group. 198 // At the very least a single row group is created per record, 199 // so calling Write always results in a new row group added. 200 // 201 // Performance-wise Write might be more favorable than WriteBuffered if you're dealing with: 202 // * a highly-restricted memory environment 203 // * very large records with lots of rows (potentially close to the max row group length) 204 func (fw *FileWriter) Write(rec arrow.Record) error { 205 if !rec.Schema().Equal(fw.schema) { 206 return fmt.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema) 207 } 208 209 var recList []arrow.Record 210 rowgroupLen := fw.wr.Properties().MaxRowGroupLength() 211 if rec.NumRows() > rowgroupLen { 212 recList = make([]arrow.Record, 0) 213 for offset := int64(0); offset < rec.NumRows(); offset += rowgroupLen { 214 s := rec.NewSlice(offset, offset+utils.Min(rowgroupLen, rec.NumRows()-offset)) 215 defer s.Release() 216 recList = append(recList, s) 217 } 218 } else { 219 recList = []arrow.Record{rec} 220 } 221 222 for _, r := range recList { 223 fw.NewRowGroup() 224 for i := 0; i < int(r.NumCols()); i++ { 225 if err := fw.WriteColumnData(r.Column(i)); err != nil { 226 fw.Close() 227 return err 228 } 229 } 230 } 231 fw.colIdx = 0 232 return nil 233 } 234 235 // WriteTable writes an arrow table to the underlying file using chunkSize to determine 236 // the size to break at for making row groups. Writing a table will always create a new 237 // row group for each chunk of chunkSize rows in the table. Calling this with 0 rows will 238 // still write a 0 length Row Group to the file. 239 func (fw *FileWriter) WriteTable(tbl arrow.Table, chunkSize int64) error { 240 if chunkSize <= 0 && tbl.NumRows() > 0 { 241 return xerrors.New("chunk size per row group must be greater than 0") 242 } else if !tbl.Schema().Equal(fw.schema) { 243 return fmt.Errorf("table schema does not match writer's. \nTable: %s\n writer: %s", tbl.Schema(), fw.schema) 244 } else if chunkSize > fw.wr.Properties().MaxRowGroupLength() { 245 chunkSize = fw.wr.Properties().MaxRowGroupLength() 246 } 247 248 writeRowGroup := func(offset, size int64) error { 249 fw.NewRowGroup() 250 for i := 0; i < int(tbl.NumCols()); i++ { 251 if err := fw.WriteColumnChunked(tbl.Column(i).Data(), offset, size); err != nil { 252 return err 253 } 254 } 255 return nil 256 } 257 258 if tbl.NumRows() == 0 { 259 if err := writeRowGroup(0, 0); err != nil { 260 fw.Close() 261 return err 262 } 263 return nil 264 } 265 266 for offset := int64(0); offset < tbl.NumRows(); offset += chunkSize { 267 if err := writeRowGroup(offset, utils.Min(chunkSize, tbl.NumRows()-offset)); err != nil { 268 fw.Close() 269 return err 270 } 271 } 272 return nil 273 } 274 275 // AppendKeyValueMetadata appends a key/value pair to the existing key/value metadata 276 func (fw *FileWriter) AppendKeyValueMetadata(key string, value string) error { 277 return fw.wr.AppendKeyValueMetadata(key, value) 278 } 279 280 // Close flushes out the data and closes the file. It can be called multiple times, 281 // subsequent calls after the first will have no effect. 282 func (fw *FileWriter) Close() error { 283 if !fw.closed { 284 fw.closed = true 285 if fw.rgw != nil { 286 if err := fw.rgw.Close(); err != nil { 287 return err 288 } 289 } 290 291 writeCtx := arrowCtxFromContext(fw.ctx) 292 if writeCtx.dataBuffer != nil { 293 writeCtx.dataBuffer.Release() 294 writeCtx.dataBuffer = nil 295 } 296 297 return fw.wr.Close() 298 } 299 return nil 300 } 301 302 // WriteColumnChunked will write the data provided to the underlying file, using the provided 303 // offset and size to allow writing subsets of data from the chunked column. It uses the current 304 // column in the underlying row group writer as the starting point, allowing progressive 305 // building of writing columns to a file via arrow data without needing to already have 306 // a record or table. 307 func (fw *FileWriter) WriteColumnChunked(data *arrow.Chunked, offset, size int64) error { 308 acw, err := NewArrowColumnWriter(data, offset, size, fw.manifest, fw.rgw, fw.colIdx) 309 if err != nil { 310 return err 311 } 312 fw.colIdx += acw.leafCount 313 return acw.Write(fw.ctx) 314 } 315 316 // WriteColumnData writes the entire array to the file as the next columns. Like WriteColumnChunked 317 // it is based on the current column of the row group writer allowing progressive building 318 // of the file by columns without needing a full record or table to write. 319 func (fw *FileWriter) WriteColumnData(data arrow.Array) error { 320 chunked := arrow.NewChunked(data.DataType(), []arrow.Array{data}) 321 defer chunked.Release() 322 return fw.WriteColumnChunked(chunked, 0, int64(data.Len())) 323 }