github.com/apache/arrow/go/v14@v14.0.1/parquet/pqarrow/file_writer.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow

import (
	"context"
	"encoding/base64"
	"fmt"
	"io"

	"github.com/apache/arrow/go/v14/arrow"
	"github.com/apache/arrow/go/v14/arrow/flight"
	"github.com/apache/arrow/go/v14/internal/utils"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/file"
	"github.com/apache/arrow/go/v14/parquet/metadata"
	"golang.org/x/xerrors"
)

// WriteTable is a convenience function to create and write a full arrow.Table to a parquet file. The schema
// and columns are determined by the schema of the table, and the file is written out to the provided writer.
// The chunkSize is used to determine the size of the row groups.
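//
// A minimal usage sketch (hedged: tbl is an arrow.Table built by the caller,
// and the file name and chunk size are illustrative assumptions):
//
//	f, err := os.Create("out.parquet")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//
//	// a nil *parquet.WriterProperties falls back to parquet.NewWriterProperties()
//	if err := WriteTable(tbl, f, 65536, nil, DefaultWriterProps()); err != nil {
//		log.Fatal(err)
//	}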
func WriteTable(tbl arrow.Table, w io.Writer, chunkSize int64, props *parquet.WriterProperties, arrprops ArrowWriterProperties) error {
	writer, err := NewFileWriter(tbl.Schema(), w, props, arrprops)
	if err != nil {
		return err
	}

	if err := writer.WriteTable(tbl, chunkSize); err != nil {
		return err
	}

	return writer.Close()
}

// FileWriter is an object for writing Arrow directly to a parquet file.
type FileWriter struct {
	wr         *file.Writer
	schema     *arrow.Schema
	manifest   *SchemaManifest
	rgw        file.RowGroupWriter
	arrowProps ArrowWriterProperties
	ctx        context.Context
	colIdx     int
	closed     bool
}

// NewFileWriter returns a writer for writing Arrow directly to a parquet file. Unlike the
// ArrowColumnWriter and WriteArrow functions, which write arrow data to an existing
// file.Writer, this creates a new file.Writer based on the schema provided.
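//
// A construction sketch (hedged: schema and w are assumed to come from the
// caller, and the property choices are examples rather than requirements):
//
//	props := parquet.NewWriterProperties(parquet.WithMaxRowGroupLength(64 * 1024))
//	// WithStoreSchema serializes the arrow schema into the "ARROW:schema"
//	// metadata key so that readers can recover the original arrow types.
//	arrProps := NewArrowWriterProperties(WithStoreSchema())
//
//	fw, err := NewFileWriter(schema, w, props, arrProps)
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer fw.Close()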
func NewFileWriter(arrschema *arrow.Schema, w io.Writer, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*FileWriter, error) {
	if props == nil {
		props = parquet.NewWriterProperties()
	}

	pqschema, err := ToParquet(arrschema, props, arrprops)
	if err != nil {
		return nil, err
	}

	meta := make(metadata.KeyValueMetadata, 0)
	for i := 0; i < arrschema.Metadata().Len(); i++ {
		meta.Append(arrschema.Metadata().Keys()[i], arrschema.Metadata().Values()[i])
	}

	if arrprops.storeSchema {
		serializedSchema := flight.SerializeSchema(arrschema, props.Allocator())
		meta.Append("ARROW:schema", base64.StdEncoding.EncodeToString(serializedSchema))
	}

	schemaNode := pqschema.Root()
	baseWriter := file.NewParquetWriter(w, schemaNode, file.WithWriterProps(props), file.WithWriteMetadata(meta))

	manifest, err := NewSchemaManifest(pqschema, nil, &ArrowReadProperties{})
	if err != nil {
		return nil, err
	}

	return &FileWriter{wr: baseWriter, schema: arrschema, manifest: manifest, arrowProps: arrprops, ctx: NewArrowWriteContext(context.TODO(), &arrprops)}, nil
}

// NewRowGroup does what it says on the tin: it creates a new row group in the underlying file.
// Equivalent to `AppendRowGroup` on a file.Writer.
func (fw *FileWriter) NewRowGroup() {
	if fw.rgw != nil {
		fw.rgw.Close()
	}
	fw.rgw = fw.wr.AppendRowGroup()
	fw.colIdx = 0
}

// NewBufferedRowGroup starts a new in-memory buffered row group, allowing columns / records
// to be written without immediately flushing them to disk. This allows using WriteBuffered to write records
// and to decide where to break your row group based on TotalBytesWritten rather than on the max
// row group length. If using Records, this should be paired with WriteBuffered, while
// Write will always write a new record as a row group in and of itself.
func (fw *FileWriter) NewBufferedRowGroup() {
	if fw.rgw != nil {
		fw.rgw.Close()
	}
	fw.rgw = fw.wr.AppendBufferedRowGroup()
	fw.colIdx = 0
}

// RowGroupTotalCompressedBytes returns the total number of bytes after compression
// that have been written to the current row group so far.
func (fw *FileWriter) RowGroupTotalCompressedBytes() int64 {
	if fw.rgw != nil {
		return fw.rgw.TotalCompressedBytes()
	}
	return 0
}

// RowGroupTotalBytesWritten returns the total number of bytes written and flushed out in
// the current row group.
func (fw *FileWriter) RowGroupTotalBytesWritten() int64 {
	if fw.rgw != nil {
		return fw.rgw.TotalBytesWritten()
	}
	return 0
}

// WriteBuffered will either append to an existing row group or create a new one
// based on the record length and the max row group length.
//
// Additionally, it allows you to manually break your row group by
// checking RowGroupTotalBytesWritten and calling NewBufferedRowGroup,
// while Write will always create at least one row group for the record.
//
// Performance-wise, WriteBuffered might be more favorable than Write if you're dealing with:
// * a loose memory environment (meaning you have a lot of memory to utilize)
// * records that have only a small (~<1K?) number of rows
//
// More memory is utilized compared to Write, as the whole row group's data is kept in memory
// before it's written, since Parquet files must have an entire column written before writing the next column.
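//
// A sketch of size-based row group breaking (hedged: recs and the 128 MiB
// threshold are illustrative assumptions, and the compressed byte count is
// just one possible signal to break on):
//
//	const targetSize = 128 * 1024 * 1024
//	for _, rec := range recs {
//		if err := fw.WriteBuffered(rec); err != nil {
//			return err
//		}
//		// start a fresh row group once enough compressed data has accumulated
//		if fw.RowGroupTotalCompressedBytes() >= targetSize {
//			fw.NewBufferedRowGroup()
//		}
//	}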
func (fw *FileWriter) WriteBuffered(rec arrow.Record) error {
	if !rec.Schema().Equal(fw.schema) {
		return fmt.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema)
	}

	var (
		recList []arrow.Record
		maxRows = fw.wr.Properties().MaxRowGroupLength()
		curRows int
		err     error
	)
	if fw.rgw != nil {
		if curRows, err = fw.rgw.NumRows(); err != nil {
			return err
		}
	} else {
		fw.NewBufferedRowGroup()
	}

	if int64(curRows)+rec.NumRows() <= maxRows {
		recList = []arrow.Record{rec}
	} else {
		recList = []arrow.Record{rec.NewSlice(0, maxRows-int64(curRows))}
		defer recList[0].Release()
		for offset := maxRows - int64(curRows); offset < rec.NumRows(); offset += maxRows {
			s := rec.NewSlice(offset, offset+utils.Min(maxRows, rec.NumRows()-offset))
			defer s.Release()
			recList = append(recList, s)
		}
	}

	for idx, r := range recList {
		if idx > 0 {
			fw.NewBufferedRowGroup()
		}
		for i := 0; i < int(r.NumCols()); i++ {
			if err := fw.WriteColumnData(r.Column(i)); err != nil {
				fw.Close()
				return err
			}
		}
	}
	fw.colIdx = 0
	return nil
}

// Write an arrow Record Batch to the file, respecting the MaxRowGroupLength in the writer
// properties to determine whether the record is broken up into more than one row group.
// At the very least a single row group is created per record,
// so calling Write always results in at least one new row group being added.
//
// Performance-wise, Write might be more favorable than WriteBuffered if you're dealing with:
// * a highly memory-restricted environment
// * very large records with lots of rows (potentially close to the max row group length)
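//
// A sketch of the unbuffered path (hedged: recs is an assumed slice of
// records; each iteration below adds at least one new row group):
//
//	for _, rec := range recs {
//		if err := fw.Write(rec); err != nil {
//			return err
//		}
//	}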
func (fw *FileWriter) Write(rec arrow.Record) error {
	if !rec.Schema().Equal(fw.schema) {
		return fmt.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema)
	}

	var recList []arrow.Record
	rowgroupLen := fw.wr.Properties().MaxRowGroupLength()
	if rec.NumRows() > rowgroupLen {
		recList = make([]arrow.Record, 0)
		for offset := int64(0); offset < rec.NumRows(); offset += rowgroupLen {
			s := rec.NewSlice(offset, offset+utils.Min(rowgroupLen, rec.NumRows()-offset))
			defer s.Release()
			recList = append(recList, s)
		}
	} else {
		recList = []arrow.Record{rec}
	}

	for _, r := range recList {
		fw.NewRowGroup()
		for i := 0; i < int(r.NumCols()); i++ {
			if err := fw.WriteColumnData(r.Column(i)); err != nil {
				fw.Close()
				return err
			}
		}
	}
	fw.colIdx = 0
	return nil
}

// WriteTable writes an arrow table to the underlying file using chunkSize to determine
// the size to break at for making row groups. Writing a table will always create a new
// row group for each chunk of chunkSize rows in the table. Calling this with 0 rows will
// still write a zero-length row group to the file.
func (fw *FileWriter) WriteTable(tbl arrow.Table, chunkSize int64) error {
	if chunkSize <= 0 && tbl.NumRows() > 0 {
		return xerrors.New("chunk size per row group must be greater than 0")
	} else if !tbl.Schema().Equal(fw.schema) {
		return fmt.Errorf("table schema does not match writer's. \nTable: %s\n writer: %s", tbl.Schema(), fw.schema)
	} else if chunkSize > fw.wr.Properties().MaxRowGroupLength() {
		chunkSize = fw.wr.Properties().MaxRowGroupLength()
	}

	writeRowGroup := func(offset, size int64) error {
		fw.NewRowGroup()
		for i := 0; i < int(tbl.NumCols()); i++ {
			if err := fw.WriteColumnChunked(tbl.Column(i).Data(), offset, size); err != nil {
				return err
			}
		}
		return nil
	}

	if tbl.NumRows() == 0 {
		if err := writeRowGroup(0, 0); err != nil {
			fw.Close()
			return err
		}
		return nil
	}

	for offset := int64(0); offset < tbl.NumRows(); offset += chunkSize {
		if err := writeRowGroup(offset, utils.Min(chunkSize, tbl.NumRows()-offset)); err != nil {
			fw.Close()
			return err
		}
	}
	return nil
}

// AppendKeyValueMetadata appends a key/value pair to the existing key/value metadata
func (fw *FileWriter) AppendKeyValueMetadata(key string, value string) error {
	return fw.wr.AppendKeyValueMetadata(key, value)
}

// Close flushes out the data and closes the file. It can be called multiple times;
// calls after the first will have no effect.
func (fw *FileWriter) Close() error {
	if !fw.closed {
		fw.closed = true
		if fw.rgw != nil {
			if err := fw.rgw.Close(); err != nil {
				return err
			}
		}

		writeCtx := arrowCtxFromContext(fw.ctx)
		if writeCtx.dataBuffer != nil {
			writeCtx.dataBuffer.Release()
			writeCtx.dataBuffer = nil
		}

		return fw.wr.Close()
	}
	return nil
}

// WriteColumnChunked will write the data provided to the underlying file, using the provided
// offset and size to allow writing subsets of data from the chunked column. It uses the current
// column in the underlying row group writer as the starting point, allowing columns to be written
// to a file progressively from arrow data without needing to already have
// a record or table.
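//
// A sketch of progressive, column-at-a-time writing (hedged: chunkedCols is an
// assumed []*arrow.Chunked with one entry per top-level schema field, and
// nrows is their shared total row count):
//
//	fw.NewRowGroup()
//	for _, col := range chunkedCols {
//		if err := fw.WriteColumnChunked(col, 0, nrows); err != nil {
//			return err
//		}
//	}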
func (fw *FileWriter) WriteColumnChunked(data *arrow.Chunked, offset, size int64) error {
	acw, err := NewArrowColumnWriter(data, offset, size, fw.manifest, fw.rgw, fw.colIdx)
	if err != nil {
		return err
	}
	fw.colIdx += acw.leafCount
	return acw.Write(fw.ctx)
}

// WriteColumnData writes the entire array to the file as the next column(s). Like WriteColumnChunked,
// it is based on the current column of the row group writer, allowing progressive building
// of the file by columns without needing a full record or table to write.
func (fw *FileWriter) WriteColumnData(data arrow.Array) error {
	chunked := arrow.NewChunked(data.DataType(), []arrow.Array{data})
	defer chunked.Release()
	return fw.WriteColumnChunked(chunked, 0, int64(data.Len()))
}