github.com/apache/arrow/go/v16@v16.1.0/parquet/pqarrow/file_writer.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow

import (
	"context"
	"encoding/base64"
	"fmt"
	"io"

	"github.com/apache/arrow/go/v16/arrow"
	"github.com/apache/arrow/go/v16/arrow/flight"
	"github.com/apache/arrow/go/v16/internal/utils"
	"github.com/apache/arrow/go/v16/parquet"
	"github.com/apache/arrow/go/v16/parquet/file"
	"github.com/apache/arrow/go/v16/parquet/metadata"
	"golang.org/x/xerrors"
)

// WriteTable is a convenience function to create and write a full arrow.Table to a parquet file. The schema
// and columns will be determined by the schema of the table, writing the file out to the provided writer.
// The chunkSize is used to determine the size of the row groups.
func WriteTable(tbl arrow.Table, w io.Writer, chunkSize int64, props *parquet.WriterProperties, arrprops ArrowWriterProperties) error {
	writer, err := NewFileWriter(tbl.Schema(), w, props, arrprops)
	if err != nil {
		return err
	}

	if err := writer.WriteTable(tbl, chunkSize); err != nil {
		return err
	}

	return writer.Close()
}
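
// A minimal usage sketch for WriteTable above (illustrative only: the
// "example.parquet" name and the tbl variable are hypothetical, and error
// handling is abbreviated):
//
//	f, err := os.Create("example.parquet")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//
//	// 1024 rows per row group, Snappy compression, default Arrow properties.
//	props := parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Snappy))
//	if err := pqarrow.WriteTable(tbl, f, 1024, props, pqarrow.DefaultWriterProps()); err != nil {
//		log.Fatal(err)
//	}
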
// FileWriter is an object for writing Arrow directly to a parquet file.
type FileWriter struct {
	wr         *file.Writer
	schema     *arrow.Schema
	manifest   *SchemaManifest
	rgw        file.RowGroupWriter
	arrowProps ArrowWriterProperties
	ctx        context.Context
	colIdx     int
	closed     bool
}

// NewFileWriter returns a writer for writing Arrow directly to a parquet file. Unlike the
// ArrowColumnWriter and WriteArrow functions, which allow writing Arrow to an existing
// file.Writer, this creates a new file.Writer based on the schema provided.
func NewFileWriter(arrschema *arrow.Schema, w io.Writer, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*FileWriter, error) {
	if props == nil {
		props = parquet.NewWriterProperties()
	}

	pqschema, err := ToParquet(arrschema, props, arrprops)
	if err != nil {
		return nil, err
	}

	meta := make(metadata.KeyValueMetadata, 0)
	for i := 0; i < arrschema.Metadata().Len(); i++ {
		meta.Append(arrschema.Metadata().Keys()[i], arrschema.Metadata().Values()[i])
	}

	if arrprops.storeSchema {
		serializedSchema := flight.SerializeSchema(arrschema, props.Allocator())
		meta.Append("ARROW:schema", base64.StdEncoding.EncodeToString(serializedSchema))
	}

	schemaNode := pqschema.Root()
	baseWriter := file.NewParquetWriter(w, schemaNode, file.WithWriterProps(props), file.WithWriteMetadata(meta))

	manifest, err := NewSchemaManifest(pqschema, nil, &ArrowReadProperties{})
	if err != nil {
		return nil, err
	}

	return &FileWriter{wr: baseWriter, schema: arrschema, manifest: manifest, arrowProps: arrprops, ctx: NewArrowWriteContext(context.TODO(), &arrprops)}, nil
}

// NewRowGroup does what it says on the tin: it creates a new row group in the underlying file.
// Equivalent to `AppendRowGroup` on a file.Writer.
func (fw *FileWriter) NewRowGroup() {
	if fw.rgw != nil {
		fw.rgw.Close()
	}
	fw.rgw = fw.wr.AppendRowGroup()
	fw.colIdx = 0
}

// NewBufferedRowGroup starts a new memory-buffered row group, allowing columns/records to be
// written without immediately flushing them to disk. This allows using WriteBuffered to write records
// and deciding where to break your row group based on TotalBytesWritten rather than on the max
// row group length. If using records, this should be paired with WriteBuffered, while
// Write will always write a new record as a row group in and of itself.
func (fw *FileWriter) NewBufferedRowGroup() {
	if fw.rgw != nil {
		fw.rgw.Close()
	}
	fw.rgw = fw.wr.AppendBufferedRowGroup()
	fw.colIdx = 0
}

// RowGroupTotalCompressedBytes returns the total number of bytes after compression
// that have been written to the current row group so far.
func (fw *FileWriter) RowGroupTotalCompressedBytes() int64 {
	if fw.rgw != nil {
		return fw.rgw.TotalCompressedBytes()
	}
	return 0
}

// RowGroupTotalBytesWritten returns the total number of bytes written and flushed out in
// the current row group.
func (fw *FileWriter) RowGroupTotalBytesWritten() int64 {
	if fw.rgw != nil {
		return fw.rgw.TotalBytesWritten()
	}
	return 0
}

// RowGroupNumRows returns the number of rows written to the current row group.
// Returns an error if the row counts are unequal across the columns written so far.
func (fw *FileWriter) RowGroupNumRows() (int, error) {
	if fw.rgw != nil {
		return fw.rgw.NumRows()
	}
	return 0, nil
}

// NumRows returns the total number of rows that have been written so far.
func (fw *FileWriter) NumRows() int {
	if fw.wr != nil {
		return fw.wr.NumRows()
	}
	return 0
}
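
// A sketch of the buffered workflow that NewBufferedRowGroup and WriteBuffered
// (below) enable: append records to one buffered row group and break it once
// the bytes written cross a target size (the records channel and the 128 MiB
// threshold here are hypothetical):
//
//	const targetSize = 128 << 20 // ~128 MiB per row group
//	for rec := range records {
//		if err := fw.WriteBuffered(rec); err != nil {
//			return err
//		}
//		rec.Release()
//		if fw.RowGroupTotalBytesWritten() >= targetSize {
//			fw.NewBufferedRowGroup()
//		}
//	}
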
// WriteBuffered will either append to an existing row group or create a new one
// based on the record length and the maximum row group length.
//
// Additionally, it allows you to manually break your row group by
// checking RowGroupTotalBytesWritten and calling NewBufferedRowGroup,
// while Write will always create at least one row group for the record.
//
// Performance-wise, WriteBuffered might be more favorable than Write if you're dealing with:
// * a loose memory environment (meaning you have a lot of memory to utilize)
// * records that have only a small number of rows (roughly under 1K)
//
// More memory is utilized compared to Write, as the whole row group's data is kept in memory
// before it's written, since Parquet files must have an entire column written before writing the next column.
func (fw *FileWriter) WriteBuffered(rec arrow.Record) error {
	if !rec.Schema().Equal(fw.schema) {
		return fmt.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema)
	}

	var (
		recList []arrow.Record
		maxRows = fw.wr.Properties().MaxRowGroupLength()
		curRows int
		err     error
	)
	if fw.rgw != nil {
		if curRows, err = fw.rgw.NumRows(); err != nil {
			return err
		}
	} else {
		fw.NewBufferedRowGroup()
	}

	if int64(curRows)+rec.NumRows() <= maxRows {
		recList = []arrow.Record{rec}
	} else {
		recList = []arrow.Record{rec.NewSlice(0, maxRows-int64(curRows))}
		defer recList[0].Release()
		for offset := maxRows - int64(curRows); offset < rec.NumRows(); offset += maxRows {
			s := rec.NewSlice(offset, offset+utils.Min(maxRows, rec.NumRows()-offset))
			defer s.Release()
			recList = append(recList, s)
		}
	}

	for idx, r := range recList {
		if idx > 0 {
			fw.NewBufferedRowGroup()
		}
		for i := 0; i < int(r.NumCols()); i++ {
			if err := fw.WriteColumnData(r.Column(i)); err != nil {
				fw.Close()
				return err
			}
		}
	}
	fw.colIdx = 0
	return nil
}

// Write writes an arrow Record Batch to the file, respecting the MaxRowGroupLength in the writer
// properties to determine whether the record is broken up into more than one row group.
// At the very least a single row group is created per record,
// so calling Write always results in a new row group being added.
//
// Performance-wise, Write might be more favorable than WriteBuffered if you're dealing with:
// * a highly-restricted memory environment
// * very large records with lots of rows (potentially close to the max row group length)
func (fw *FileWriter) Write(rec arrow.Record) error {
	if !rec.Schema().Equal(fw.schema) {
		return fmt.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema)
	}

	var recList []arrow.Record
	rowgroupLen := fw.wr.Properties().MaxRowGroupLength()
	if rec.NumRows() > rowgroupLen {
		recList = make([]arrow.Record, 0)
		for offset := int64(0); offset < rec.NumRows(); offset += rowgroupLen {
			s := rec.NewSlice(offset, offset+utils.Min(rowgroupLen, rec.NumRows()-offset))
			defer s.Release()
			recList = append(recList, s)
		}
	} else {
		recList = []arrow.Record{rec}
	}

	for _, r := range recList {
		fw.NewRowGroup()
		for i := 0; i < int(r.NumCols()); i++ {
			if err := fw.WriteColumnData(r.Column(i)); err != nil {
				fw.Close()
				return err
			}
		}
	}
	fw.colIdx = 0
	return nil
}
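
// A sketch of the unbuffered path using Write above: each record becomes at
// least one row group of its own, which suits streaming large records with
// limited memory (rdr is a hypothetical array.RecordReader):
//
//	for rdr.Next() {
//		if err := fw.Write(rdr.Record()); err != nil {
//			return err
//		}
//	}
//	return fw.Close()
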
// WriteTable writes an arrow table to the underlying file using chunkSize to determine
// the size to break at for making row groups. Writing a table will always create a new
// row group for each chunk of chunkSize rows in the table. Calling this with 0 rows will
// still write a 0-length row group to the file.
func (fw *FileWriter) WriteTable(tbl arrow.Table, chunkSize int64) error {
	if chunkSize <= 0 && tbl.NumRows() > 0 {
		return xerrors.New("chunk size per row group must be greater than 0")
	} else if !tbl.Schema().Equal(fw.schema) {
		return fmt.Errorf("table schema does not match writer's. \nTable: %s\n writer: %s", tbl.Schema(), fw.schema)
	} else if chunkSize > fw.wr.Properties().MaxRowGroupLength() {
		chunkSize = fw.wr.Properties().MaxRowGroupLength()
	}

	writeRowGroup := func(offset, size int64) error {
		fw.NewRowGroup()
		for i := 0; i < int(tbl.NumCols()); i++ {
			if err := fw.WriteColumnChunked(tbl.Column(i).Data(), offset, size); err != nil {
				return err
			}
		}
		return nil
	}

	if tbl.NumRows() == 0 {
		if err := writeRowGroup(0, 0); err != nil {
			fw.Close()
			return err
		}
		return nil
	}

	for offset := int64(0); offset < tbl.NumRows(); offset += chunkSize {
		if err := writeRowGroup(offset, utils.Min(chunkSize, tbl.NumRows()-offset)); err != nil {
			fw.Close()
			return err
		}
	}
	return nil
}

// AppendKeyValueMetadata appends a key/value pair to the existing key/value metadata.
func (fw *FileWriter) AppendKeyValueMetadata(key string, value string) error {
	return fw.wr.AppendKeyValueMetadata(key, value)
}

// Close flushes out the data and closes the file. It can be called multiple times;
// calls after the first will have no effect.
func (fw *FileWriter) Close() error {
	if !fw.closed {
		fw.closed = true
		if fw.rgw != nil {
			if err := fw.rgw.Close(); err != nil {
				return err
			}
		}

		writeCtx := arrowCtxFromContext(fw.ctx)
		if writeCtx.dataBuffer != nil {
			writeCtx.dataBuffer.Release()
			writeCtx.dataBuffer = nil
		}

		return fw.wr.Close()
	}
	return nil
}

// WriteColumnChunked will write the provided data to the underlying file, using the provided
// offset and size to allow writing subsets of data from the chunked column. It uses the current
// column in the underlying row group writer as the starting point, allowing progressive,
// column-by-column building of the file from arrow data without needing to already have
// a record or table.
func (fw *FileWriter) WriteColumnChunked(data *arrow.Chunked, offset, size int64) error {
	acw, err := newArrowColumnWriter(data, offset, size, fw.manifest, fw.rgw, fw.colIdx)
	if err != nil {
		return err
	}
	fw.colIdx += acw.leafCount
	return acw.Write(fw.ctx)
}

// WriteColumnData writes the entire array to the file as the next column(s). Like WriteColumnChunked,
// it is based on the current column of the row group writer, allowing progressive building
// of the file by columns without needing a full record or table to write.
func (fw *FileWriter) WriteColumnData(data arrow.Array) error {
	chunked := arrow.NewChunked(data.DataType(), []arrow.Array{data})
	defer chunked.Release()
	return fw.WriteColumnChunked(chunked, 0, int64(data.Len()))
}
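
// A sketch of progressive, column-at-a-time writing with NewRowGroup and
// WriteColumnData above (idArr and nameArr are hypothetical arrays that must
// match the writer's schema order and have equal lengths):
//
//	fw.NewRowGroup()
//	if err := fw.WriteColumnData(idArr); err != nil {
//		return err
//	}
//	if err := fw.WriteColumnData(nameArr); err != nil {
//		return err
//	}
//	return fw.Close()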