github.com/apache/arrow/go/v10@v10.0.1/parquet/pqarrow/file_writer.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package pqarrow 18 19 import ( 20 "context" 21 "encoding/base64" 22 "fmt" 23 "io" 24 25 "github.com/apache/arrow/go/v10/arrow" 26 "github.com/apache/arrow/go/v10/arrow/flight" 27 "github.com/apache/arrow/go/v10/internal/utils" 28 "github.com/apache/arrow/go/v10/parquet" 29 "github.com/apache/arrow/go/v10/parquet/file" 30 "github.com/apache/arrow/go/v10/parquet/metadata" 31 "golang.org/x/xerrors" 32 ) 33 34 // WriteTable is a convenience function to create and write a full array.Table to a parquet file. The schema 35 // and columns will be determined by the schema of the table, writing the file out to the the provided writer. 36 // The chunksize will be utilized in order to determine the size of the row groups. 37 func WriteTable(tbl arrow.Table, w io.Writer, chunkSize int64, props *parquet.WriterProperties, arrprops ArrowWriterProperties) error { 38 writer, err := NewFileWriter(tbl.Schema(), w, props, arrprops) 39 if err != nil { 40 return err 41 } 42 43 if err := writer.WriteTable(tbl, chunkSize); err != nil { 44 return err 45 } 46 47 return writer.Close() 48 } 49 50 // FileWriter is an object for writing Arrow directly to a parquet file. 51 type FileWriter struct { 52 wr *file.Writer 53 schema *arrow.Schema 54 manifest *SchemaManifest 55 rgw file.RowGroupWriter 56 arrowProps ArrowWriterProperties 57 ctx context.Context 58 colIdx int 59 closed bool 60 } 61 62 // NewFileWriter returns a writer for writing Arrow directly to a parquetfile, rather than 63 // the ArrowColumnWriter and WriteArrow functions which allow writing arrow to an existing 64 // file.Writer, this will create a new file.Writer based on the schema provided. 65 func NewFileWriter(arrschema *arrow.Schema, w io.Writer, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*FileWriter, error) { 66 if props == nil { 67 props = parquet.NewWriterProperties() 68 } 69 70 pqschema, err := ToParquet(arrschema, props, arrprops) 71 if err != nil { 72 return nil, err 73 } 74 75 meta := make(metadata.KeyValueMetadata, 0) 76 for i := 0; i < arrschema.Metadata().Len(); i++ { 77 meta.Append(arrschema.Metadata().Keys()[i], arrschema.Metadata().Values()[i]) 78 } 79 80 if arrprops.storeSchema { 81 serializedSchema := flight.SerializeSchema(arrschema, props.Allocator()) 82 meta.Append("ARROW:schema", base64.StdEncoding.EncodeToString(serializedSchema)) 83 } 84 85 schemaNode := pqschema.Root() 86 baseWriter := file.NewParquetWriter(w, schemaNode, file.WithWriterProps(props), file.WithWriteMetadata(meta)) 87 88 manifest, err := NewSchemaManifest(pqschema, nil, &ArrowReadProperties{}) 89 if err != nil { 90 return nil, err 91 } 92 93 return &FileWriter{wr: baseWriter, schema: arrschema, manifest: manifest, arrowProps: arrprops, ctx: NewArrowWriteContext(context.TODO(), &arrprops)}, nil 94 } 95 96 // NewRowGroup does what it says on the tin, creates a new row group in the underlying file. 97 // Equivalent to `AppendRowGroup` on a file.Writer 98 func (fw *FileWriter) NewRowGroup() { 99 if fw.rgw != nil { 100 fw.rgw.Close() 101 } 102 fw.rgw = fw.wr.AppendRowGroup() 103 fw.colIdx = 0 104 } 105 106 // NewBufferedRowGroup starts a new memory Buffered Row Group to allow writing columns / records 107 // without immediately flushing them to disk. This allows using WriteBuffered to write records 108 // and decide where to break your rowgroup based on the TotalBytesWritten rather than on the max 109 // row group len. If using Records, this should be paired with WriteBuffered, while 110 // Write will always write a new record as a row group in and of itself. 111 func (fw *FileWriter) NewBufferedRowGroup() { 112 if fw.rgw != nil { 113 fw.rgw.Close() 114 } 115 fw.rgw = fw.wr.AppendBufferedRowGroup() 116 fw.colIdx = 0 117 } 118 119 // RowGroupTotalCompressedBytes returns the total number of bytes after compression 120 // that have been written to the current row group so far. 121 func (fw *FileWriter) RowGroupTotalCompressedBytes() int64 { 122 if fw.rgw != nil { 123 return fw.rgw.TotalCompressedBytes() 124 } 125 return 0 126 } 127 128 // RowGroupTotalBytesWritten returns the total number of bytes written and flushed out in 129 // the current row group. 130 func (fw *FileWriter) RowGroupTotalBytesWritten() int64 { 131 if fw.rgw != nil { 132 return fw.rgw.TotalBytesWritten() 133 } 134 return 0 135 } 136 137 func (fw *FileWriter) WriteBuffered(rec arrow.Record) error { 138 if !rec.Schema().Equal(fw.schema) { 139 return fmt.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema) 140 } 141 142 var ( 143 recList []arrow.Record 144 maxRows = fw.wr.Properties().MaxRowGroupLength() 145 curRows int 146 err error 147 ) 148 if fw.rgw != nil { 149 if curRows, err = fw.rgw.NumRows(); err != nil { 150 return err 151 } 152 } else { 153 fw.NewBufferedRowGroup() 154 } 155 156 if int64(curRows)+rec.NumRows() <= maxRows { 157 recList = []arrow.Record{rec} 158 } else { 159 recList = []arrow.Record{rec.NewSlice(0, maxRows-int64(curRows))} 160 defer recList[0].Release() 161 for offset := maxRows - int64(curRows); offset < rec.NumRows(); offset += maxRows { 162 s := rec.NewSlice(offset, offset+utils.Min(maxRows, rec.NumRows()-offset)) 163 defer s.Release() 164 recList = append(recList, s) 165 } 166 } 167 168 for idx, r := range recList { 169 if idx > 0 { 170 fw.NewBufferedRowGroup() 171 } 172 for i := 0; i < int(r.NumCols()); i++ { 173 if err := fw.WriteColumnData(r.Column(i)); err != nil { 174 fw.Close() 175 return err 176 } 177 } 178 } 179 fw.colIdx = 0 180 return nil 181 } 182 183 // Write an arrow Record Batch to the file, respecting the MaxRowGroupLength in the writer 184 // properties to determine whether or not a new row group is created while writing. 185 func (fw *FileWriter) Write(rec arrow.Record) error { 186 if !rec.Schema().Equal(fw.schema) { 187 return fmt.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema) 188 } 189 190 var recList []arrow.Record 191 rowgroupLen := fw.wr.Properties().MaxRowGroupLength() 192 if rec.NumRows() > rowgroupLen { 193 recList = make([]arrow.Record, 0) 194 for offset := int64(0); offset < rec.NumRows(); offset += rowgroupLen { 195 s := rec.NewSlice(offset, offset+utils.Min(rowgroupLen, rec.NumRows()-offset)) 196 defer s.Release() 197 recList = append(recList, s) 198 } 199 } else { 200 recList = []arrow.Record{rec} 201 } 202 203 for _, r := range recList { 204 fw.NewRowGroup() 205 for i := 0; i < int(r.NumCols()); i++ { 206 if err := fw.WriteColumnData(r.Column(i)); err != nil { 207 fw.Close() 208 return err 209 } 210 } 211 } 212 fw.colIdx = 0 213 return nil 214 } 215 216 // WriteTable writes an arrow table to the underlying file using chunkSize to determine 217 // the size to break at for making row groups. Writing a table will always create a new 218 // row group for each chunk of chunkSize rows in the table. Calling this with 0 rows will 219 // still write a 0 length Row Group to the file. 220 func (fw *FileWriter) WriteTable(tbl arrow.Table, chunkSize int64) error { 221 if chunkSize <= 0 && tbl.NumRows() > 0 { 222 return xerrors.New("chunk size per row group must be greater than 0") 223 } else if !tbl.Schema().Equal(fw.schema) { 224 return fmt.Errorf("table schema does not match writer's. \nTable: %s\n writer: %s", tbl.Schema(), fw.schema) 225 } else if chunkSize > fw.wr.Properties().MaxRowGroupLength() { 226 chunkSize = fw.wr.Properties().MaxRowGroupLength() 227 } 228 229 writeRowGroup := func(offset, size int64) error { 230 fw.NewRowGroup() 231 for i := 0; i < int(tbl.NumCols()); i++ { 232 if err := fw.WriteColumnChunked(tbl.Column(i).Data(), offset, size); err != nil { 233 return err 234 } 235 } 236 return nil 237 } 238 239 if tbl.NumRows() == 0 { 240 if err := writeRowGroup(0, 0); err != nil { 241 fw.Close() 242 return err 243 } 244 return nil 245 } 246 247 for offset := int64(0); offset < tbl.NumRows(); offset += chunkSize { 248 if err := writeRowGroup(offset, utils.Min(chunkSize, tbl.NumRows()-offset)); err != nil { 249 fw.Close() 250 return err 251 } 252 } 253 return nil 254 } 255 256 // Close flushes out the data and closes the file. It can be called multiple times, 257 // subsequent calls after the first will have no effect. 258 func (fw *FileWriter) Close() error { 259 if !fw.closed { 260 fw.closed = true 261 if fw.rgw != nil { 262 if err := fw.rgw.Close(); err != nil { 263 return err 264 } 265 } 266 return fw.wr.Close() 267 } 268 return nil 269 } 270 271 // WriteColumnChunked will write the data provided to the underlying file, using the provided 272 // offset and size to allow writing subsets of data from the chunked column. It uses the current 273 // column in the underlying row group writer as the starting point, allowing progressive 274 // building of writing columns to a file via arrow data without needing to already have 275 // a record or table. 276 func (fw *FileWriter) WriteColumnChunked(data *arrow.Chunked, offset, size int64) error { 277 acw, err := NewArrowColumnWriter(data, offset, size, fw.manifest, fw.rgw, fw.colIdx) 278 if err != nil { 279 return err 280 } 281 fw.colIdx += acw.leafCount 282 return acw.Write(fw.ctx) 283 } 284 285 // WriteColumnData writes the entire array to the file as the next columns. Like WriteColumnChunked 286 // it is based on the current column of the row group writer allowing progressive building 287 // of the file by columns without needing a full record or table to write. 288 func (fw *FileWriter) WriteColumnData(data arrow.Array) error { 289 chunked := arrow.NewChunked(data.DataType(), []arrow.Array{data}) 290 defer chunked.Release() 291 return fw.WriteColumnChunked(chunked, 0, int64(data.Len())) 292 }