github.com/apache/arrow/go/v7@v7.0.1/parquet/pqarrow/file_writer.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow 18 19 import ( 20 "context" 21 "encoding/base64" 22 "io" 23 24 "github.com/apache/arrow/go/v7/arrow" 25 "github.com/apache/arrow/go/v7/arrow/flight" 26 "github.com/apache/arrow/go/v7/parquet" 27 "github.com/apache/arrow/go/v7/parquet/file" 28 "github.com/apache/arrow/go/v7/parquet/internal/utils" 29 "github.com/apache/arrow/go/v7/parquet/metadata" 30 "golang.org/x/xerrors" 31 ) 32 33 // WriteTable is a convenience function to create and write a full array.Table to a parquet file. The schema 34 // and columns will be determined by the schema of the table, writing the file out to the the provided writer. 35 // The chunksize will be utilized in order to determine the size of the row groups. 36 func WriteTable(tbl arrow.Table, w io.Writer, chunkSize int64, props *parquet.WriterProperties, arrprops ArrowWriterProperties) error { 37 writer, err := NewFileWriter(tbl.Schema(), w, props, arrprops) 38 if err != nil { 39 return err 40 } 41 42 if err := writer.WriteTable(tbl, chunkSize); err != nil { 43 return err 44 } 45 46 return writer.Close() 47 } 48 49 // FileWriter is an object for writing Arrow directly to a parquet file. 50 type FileWriter struct { 51 wr *file.Writer 52 schema *arrow.Schema 53 manifest *SchemaManifest 54 rgw file.RowGroupWriter 55 arrowProps ArrowWriterProperties 56 ctx context.Context 57 colIdx int 58 closed bool 59 } 60 61 // NewFileWriter returns a writer for writing Arrow directly to a parquetfile, rather than 62 // the ArrowColumnWriter and WriteArrow functions which allow writing arrow to an existing 63 // file.Writer, this will create a new file.Writer based on the schema provided. 64 func NewFileWriter(arrschema *arrow.Schema, w io.Writer, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*FileWriter, error) { 65 if props == nil { 66 props = parquet.NewWriterProperties() 67 } 68 69 pqschema, err := ToParquet(arrschema, props, arrprops) 70 if err != nil { 71 return nil, err 72 } 73 74 meta := make(metadata.KeyValueMetadata, 0) 75 if arrprops.storeSchema { 76 for i := 0; i < arrschema.Metadata().Len(); i++ { 77 meta.Append(arrschema.Metadata().Keys()[i], arrschema.Metadata().Values()[i]) 78 } 79 80 serializedSchema := flight.SerializeSchema(arrschema, props.Allocator()) 81 meta.Append("ARROW:schema", base64.RawStdEncoding.EncodeToString(serializedSchema)) 82 } 83 84 schemaNode := pqschema.Root() 85 baseWriter := file.NewParquetWriter(w, schemaNode, file.WithWriterProps(props), file.WithWriteMetadata(meta)) 86 87 manifest, err := NewSchemaManifest(pqschema, nil, &ArrowReadProperties{}) 88 if err != nil { 89 return nil, err 90 } 91 92 return &FileWriter{wr: baseWriter, schema: arrschema, manifest: manifest, arrowProps: arrprops, ctx: NewArrowWriteContext(context.TODO(), &arrprops)}, nil 93 } 94 95 // NewRowGroup does what it says on the tin, creates a new row group in the underlying file. 96 // Equivalent to `AppendRowGroup` on a file.Writer 97 func (fw *FileWriter) NewRowGroup() { 98 if fw.rgw != nil { 99 fw.rgw.Close() 100 } 101 fw.rgw = fw.wr.AppendRowGroup() 102 fw.colIdx = 0 103 } 104 105 // NewBufferedRowGroup starts a new memory Buffered Row Group to allow writing columns / records 106 // without immediately flushing them to disk. This allows using WriteBuffered to write records 107 // and decide where to break your rowgroup based on the TotalBytesWritten rather than on the max 108 // row group len. If using Records, this should be paired with WriteBuffered, while 109 // Write will always write a new record as a row group in and of itself. 110 func (fw *FileWriter) NewBufferedRowGroup() { 111 if fw.rgw != nil { 112 fw.rgw.Close() 113 } 114 fw.rgw = fw.wr.AppendBufferedRowGroup() 115 fw.colIdx = 0 116 } 117 118 // RowGroupTotalCompressedBytes returns the total number of bytes after compression 119 // that have been written to the current row group so far. 120 func (fw *FileWriter) RowGroupTotalCompressedBytes() int64 { 121 if fw.rgw != nil { 122 return fw.rgw.TotalCompressedBytes() 123 } 124 return 0 125 } 126 127 // RowGroupTotalBytesWritten returns the total number of bytes written and flushed out in 128 // the current row group. 129 func (fw *FileWriter) RowGroupTotalBytesWritten() int64 { 130 if fw.rgw != nil { 131 return fw.rgw.TotalBytesWritten() 132 } 133 return 0 134 } 135 136 func (fw *FileWriter) WriteBuffered(rec arrow.Record) error { 137 if !rec.Schema().Equal(fw.schema) { 138 return xerrors.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema) 139 } 140 141 var ( 142 recList []arrow.Record 143 maxRows = fw.wr.Properties().MaxRowGroupLength() 144 curRows int 145 err error 146 ) 147 if fw.rgw != nil { 148 if curRows, err = fw.rgw.NumRows(); err != nil { 149 return err 150 } 151 } else { 152 fw.NewBufferedRowGroup() 153 } 154 155 if int64(curRows)+rec.NumRows() <= maxRows { 156 recList = []arrow.Record{rec} 157 } else { 158 recList = []arrow.Record{rec.NewSlice(0, maxRows-int64(curRows))} 159 defer recList[0].Release() 160 for offset := maxRows - int64(curRows); offset < rec.NumRows(); offset += maxRows { 161 s := rec.NewSlice(offset, offset+utils.Min(maxRows, rec.NumRows()-offset)) 162 defer s.Release() 163 recList = append(recList, s) 164 } 165 } 166 167 for idx, r := range recList { 168 if idx > 0 { 169 fw.NewBufferedRowGroup() 170 } 171 for i := 0; i < int(r.NumCols()); i++ { 172 if err := fw.WriteColumnData(r.Column(i)); err != nil { 173 fw.Close() 174 return err 175 } 176 } 177 } 178 fw.colIdx = 0 179 return nil 180 } 181 182 // Write an arrow Record Batch to the file, respecting the MaxRowGroupLength in the writer 183 // properties to determine whether or not a new row group is created while writing. 184 func (fw *FileWriter) Write(rec arrow.Record) error { 185 if !rec.Schema().Equal(fw.schema) { 186 return xerrors.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema) 187 } 188 189 var recList []arrow.Record 190 rowgroupLen := fw.wr.Properties().MaxRowGroupLength() 191 if rec.NumRows() > rowgroupLen { 192 recList = make([]arrow.Record, 0) 193 for offset := int64(0); offset < rec.NumRows(); offset += rowgroupLen { 194 s := rec.NewSlice(offset, offset+utils.Min(rowgroupLen, rec.NumRows()-offset)) 195 defer s.Release() 196 recList = append(recList, s) 197 } 198 } else { 199 recList = []arrow.Record{rec} 200 } 201 202 for _, r := range recList { 203 fw.NewRowGroup() 204 for i := 0; i < int(r.NumCols()); i++ { 205 if err := fw.WriteColumnData(r.Column(i)); err != nil { 206 fw.Close() 207 return err 208 } 209 } 210 } 211 fw.colIdx = 0 212 return nil 213 } 214 215 // WriteTable writes an arrow table to the underlying file using chunkSize to determine 216 // the size to break at for making row groups. Writing a table will always create a new 217 // row group for each chunk of chunkSize rows in the table. Calling this with 0 rows will 218 // still write a 0 length Row Group to the file. 219 func (fw *FileWriter) WriteTable(tbl arrow.Table, chunkSize int64) error { 220 if chunkSize <= 0 && tbl.NumRows() > 0 { 221 return xerrors.New("chunk size per row group must be greater than 0") 222 } else if !tbl.Schema().Equal(fw.schema) { 223 return xerrors.Errorf("table schema does not match writer's. \nTable: %s\n writer: %s", tbl.Schema(), fw.schema) 224 } else if chunkSize > fw.wr.Properties().MaxRowGroupLength() { 225 chunkSize = fw.wr.Properties().MaxRowGroupLength() 226 } 227 228 writeRowGroup := func(offset, size int64) error { 229 fw.NewRowGroup() 230 for i := 0; i < int(tbl.NumCols()); i++ { 231 if err := fw.WriteColumnChunked(tbl.Column(i).Data(), offset, size); err != nil { 232 return err 233 } 234 } 235 return nil 236 } 237 238 if tbl.NumRows() == 0 { 239 if err := writeRowGroup(0, 0); err != nil { 240 fw.Close() 241 return err 242 } 243 return nil 244 } 245 246 for offset := int64(0); offset < tbl.NumRows(); offset += chunkSize { 247 if err := writeRowGroup(offset, utils.Min(chunkSize, tbl.NumRows()-offset)); err != nil { 248 fw.Close() 249 return err 250 } 251 } 252 return nil 253 } 254 255 // Close flushes out the data and closes the file. It can be called multiple times, 256 // subsequent calls after the first will have no effect. 257 func (fw *FileWriter) Close() error { 258 if !fw.closed { 259 fw.closed = true 260 if fw.rgw != nil { 261 if err := fw.rgw.Close(); err != nil { 262 return err 263 } 264 } 265 return fw.wr.Close() 266 } 267 return nil 268 } 269 270 // WriteColumnChunked will write the data provided to the underlying file, using the provided 271 // offset and size to allow writing subsets of data from the chunked column. It uses the current 272 // column in the underlying row group writer as the starting point, allowing progressive 273 // building of writing columns to a file via arrow data without needing to already have 274 // a record or table. 275 func (fw *FileWriter) WriteColumnChunked(data *arrow.Chunked, offset, size int64) error { 276 acw, err := NewArrowColumnWriter(data, offset, size, fw.manifest, fw.rgw, fw.colIdx) 277 if err != nil { 278 return err 279 } 280 fw.colIdx += acw.leafCount 281 return acw.Write(fw.ctx) 282 } 283 284 // WriteColumnData writes the entire array to the file as the next columns. Like WriteColumnChunked 285 // it is based on the current column of the row group writer allowing progressive building 286 // of the file by columns without needing a full record or table to write. 287 func (fw *FileWriter) WriteColumnData(data arrow.Array) error { 288 chunked := arrow.NewChunked(data.DataType(), []arrow.Array{data}) 289 defer chunked.Release() 290 return fw.WriteColumnChunked(chunked, 0, int64(data.Len())) 291 }