github.com/apache/arrow/go/v10@v10.0.1/parquet/pqarrow/file_writer.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow
    18  
    19  import (
    20  	"context"
    21  	"encoding/base64"
    22  	"fmt"
    23  	"io"
    24  
    25  	"github.com/apache/arrow/go/v10/arrow"
    26  	"github.com/apache/arrow/go/v10/arrow/flight"
    27  	"github.com/apache/arrow/go/v10/internal/utils"
    28  	"github.com/apache/arrow/go/v10/parquet"
    29  	"github.com/apache/arrow/go/v10/parquet/file"
    30  	"github.com/apache/arrow/go/v10/parquet/metadata"
    31  	"golang.org/x/xerrors"
    32  )
    33  
    34  // WriteTable is a convenience function to create and write a full array.Table to a parquet file. The schema
    35  // and columns will be determined by the schema of the table, writing the file out to the the provided writer.
    36  // The chunksize will be utilized in order to determine the size of the row groups.
    37  func WriteTable(tbl arrow.Table, w io.Writer, chunkSize int64, props *parquet.WriterProperties, arrprops ArrowWriterProperties) error {
    38  	writer, err := NewFileWriter(tbl.Schema(), w, props, arrprops)
    39  	if err != nil {
    40  		return err
    41  	}
    42  
    43  	if err := writer.WriteTable(tbl, chunkSize); err != nil {
    44  		return err
    45  	}
    46  
    47  	return writer.Close()
    48  }
    49  
// FileWriter is an object for writing Arrow directly to a parquet file.
//
// It wraps a parquet file.Writer together with the arrow schema it was created
// for, and tracks the row group and column position currently being written.
type FileWriter struct {
	wr         *file.Writer          // underlying parquet file writer that row groups are appended to
	schema     *arrow.Schema         // arrow schema given to NewFileWriter; records and tables are checked against it
	manifest   *SchemaManifest       // manifest built from the parquet schema, handed to each arrow column writer
	rgw        file.RowGroupWriter   // current row group writer; nil until a row group has been started
	arrowProps ArrowWriterProperties // arrow writer properties given to NewFileWriter
	ctx        context.Context       // arrow write context passed to every column write
	colIdx     int                   // index of the next leaf column to write in the current row group
	closed     bool                  // set by the first call to Close so later calls are no-ops
}
    61  
    62  // NewFileWriter returns a writer for writing Arrow directly to a parquetfile, rather than
    63  // the ArrowColumnWriter and WriteArrow functions which allow writing arrow to an existing
    64  // file.Writer, this will create a new file.Writer based on the schema provided.
    65  func NewFileWriter(arrschema *arrow.Schema, w io.Writer, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*FileWriter, error) {
    66  	if props == nil {
    67  		props = parquet.NewWriterProperties()
    68  	}
    69  
    70  	pqschema, err := ToParquet(arrschema, props, arrprops)
    71  	if err != nil {
    72  		return nil, err
    73  	}
    74  
    75  	meta := make(metadata.KeyValueMetadata, 0)
    76  	for i := 0; i < arrschema.Metadata().Len(); i++ {
    77  		meta.Append(arrschema.Metadata().Keys()[i], arrschema.Metadata().Values()[i])
    78  	}
    79  
    80  	if arrprops.storeSchema {
    81  		serializedSchema := flight.SerializeSchema(arrschema, props.Allocator())
    82  		meta.Append("ARROW:schema", base64.StdEncoding.EncodeToString(serializedSchema))
    83  	}
    84  
    85  	schemaNode := pqschema.Root()
    86  	baseWriter := file.NewParquetWriter(w, schemaNode, file.WithWriterProps(props), file.WithWriteMetadata(meta))
    87  
    88  	manifest, err := NewSchemaManifest(pqschema, nil, &ArrowReadProperties{})
    89  	if err != nil {
    90  		return nil, err
    91  	}
    92  
    93  	return &FileWriter{wr: baseWriter, schema: arrschema, manifest: manifest, arrowProps: arrprops, ctx: NewArrowWriteContext(context.TODO(), &arrprops)}, nil
    94  }
    95  
    96  // NewRowGroup does what it says on the tin, creates a new row group in the underlying file.
    97  // Equivalent to `AppendRowGroup` on a file.Writer
    98  func (fw *FileWriter) NewRowGroup() {
    99  	if fw.rgw != nil {
   100  		fw.rgw.Close()
   101  	}
   102  	fw.rgw = fw.wr.AppendRowGroup()
   103  	fw.colIdx = 0
   104  }
   105  
   106  // NewBufferedRowGroup starts a new memory Buffered Row Group to allow writing columns / records
   107  // without immediately flushing them to disk. This allows using WriteBuffered to write records
   108  // and decide where to break your rowgroup based on the TotalBytesWritten rather than on the max
   109  // row group len. If using Records, this should be paired with WriteBuffered, while
   110  // Write will always write a new record as a row group in and of itself.
   111  func (fw *FileWriter) NewBufferedRowGroup() {
   112  	if fw.rgw != nil {
   113  		fw.rgw.Close()
   114  	}
   115  	fw.rgw = fw.wr.AppendBufferedRowGroup()
   116  	fw.colIdx = 0
   117  }
   118  
   119  // RowGroupTotalCompressedBytes returns the total number of bytes after compression
   120  // that have been written to the current row group so far.
   121  func (fw *FileWriter) RowGroupTotalCompressedBytes() int64 {
   122  	if fw.rgw != nil {
   123  		return fw.rgw.TotalCompressedBytes()
   124  	}
   125  	return 0
   126  }
   127  
   128  // RowGroupTotalBytesWritten returns the total number of bytes written and flushed out in
   129  // the current row group.
   130  func (fw *FileWriter) RowGroupTotalBytesWritten() int64 {
   131  	if fw.rgw != nil {
   132  		return fw.rgw.TotalBytesWritten()
   133  	}
   134  	return 0
   135  }
   136  
   137  func (fw *FileWriter) WriteBuffered(rec arrow.Record) error {
   138  	if !rec.Schema().Equal(fw.schema) {
   139  		return fmt.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema)
   140  	}
   141  
   142  	var (
   143  		recList []arrow.Record
   144  		maxRows = fw.wr.Properties().MaxRowGroupLength()
   145  		curRows int
   146  		err     error
   147  	)
   148  	if fw.rgw != nil {
   149  		if curRows, err = fw.rgw.NumRows(); err != nil {
   150  			return err
   151  		}
   152  	} else {
   153  		fw.NewBufferedRowGroup()
   154  	}
   155  
   156  	if int64(curRows)+rec.NumRows() <= maxRows {
   157  		recList = []arrow.Record{rec}
   158  	} else {
   159  		recList = []arrow.Record{rec.NewSlice(0, maxRows-int64(curRows))}
   160  		defer recList[0].Release()
   161  		for offset := maxRows - int64(curRows); offset < rec.NumRows(); offset += maxRows {
   162  			s := rec.NewSlice(offset, offset+utils.Min(maxRows, rec.NumRows()-offset))
   163  			defer s.Release()
   164  			recList = append(recList, s)
   165  		}
   166  	}
   167  
   168  	for idx, r := range recList {
   169  		if idx > 0 {
   170  			fw.NewBufferedRowGroup()
   171  		}
   172  		for i := 0; i < int(r.NumCols()); i++ {
   173  			if err := fw.WriteColumnData(r.Column(i)); err != nil {
   174  				fw.Close()
   175  				return err
   176  			}
   177  		}
   178  	}
   179  	fw.colIdx = 0
   180  	return nil
   181  }
   182  
   183  // Write an arrow Record Batch to the file, respecting the MaxRowGroupLength in the writer
   184  // properties to determine whether or not a new row group is created while writing.
   185  func (fw *FileWriter) Write(rec arrow.Record) error {
   186  	if !rec.Schema().Equal(fw.schema) {
   187  		return fmt.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema)
   188  	}
   189  
   190  	var recList []arrow.Record
   191  	rowgroupLen := fw.wr.Properties().MaxRowGroupLength()
   192  	if rec.NumRows() > rowgroupLen {
   193  		recList = make([]arrow.Record, 0)
   194  		for offset := int64(0); offset < rec.NumRows(); offset += rowgroupLen {
   195  			s := rec.NewSlice(offset, offset+utils.Min(rowgroupLen, rec.NumRows()-offset))
   196  			defer s.Release()
   197  			recList = append(recList, s)
   198  		}
   199  	} else {
   200  		recList = []arrow.Record{rec}
   201  	}
   202  
   203  	for _, r := range recList {
   204  		fw.NewRowGroup()
   205  		for i := 0; i < int(r.NumCols()); i++ {
   206  			if err := fw.WriteColumnData(r.Column(i)); err != nil {
   207  				fw.Close()
   208  				return err
   209  			}
   210  		}
   211  	}
   212  	fw.colIdx = 0
   213  	return nil
   214  }
   215  
   216  // WriteTable writes an arrow table to the underlying file using chunkSize to determine
   217  // the size to break at for making row groups. Writing a table will always create a new
   218  // row group for each chunk of chunkSize rows in the table. Calling this with 0 rows will
   219  // still write a 0 length Row Group to the file.
   220  func (fw *FileWriter) WriteTable(tbl arrow.Table, chunkSize int64) error {
   221  	if chunkSize <= 0 && tbl.NumRows() > 0 {
   222  		return xerrors.New("chunk size per row group must be greater than 0")
   223  	} else if !tbl.Schema().Equal(fw.schema) {
   224  		return fmt.Errorf("table schema does not match writer's. \nTable: %s\n writer: %s", tbl.Schema(), fw.schema)
   225  	} else if chunkSize > fw.wr.Properties().MaxRowGroupLength() {
   226  		chunkSize = fw.wr.Properties().MaxRowGroupLength()
   227  	}
   228  
   229  	writeRowGroup := func(offset, size int64) error {
   230  		fw.NewRowGroup()
   231  		for i := 0; i < int(tbl.NumCols()); i++ {
   232  			if err := fw.WriteColumnChunked(tbl.Column(i).Data(), offset, size); err != nil {
   233  				return err
   234  			}
   235  		}
   236  		return nil
   237  	}
   238  
   239  	if tbl.NumRows() == 0 {
   240  		if err := writeRowGroup(0, 0); err != nil {
   241  			fw.Close()
   242  			return err
   243  		}
   244  		return nil
   245  	}
   246  
   247  	for offset := int64(0); offset < tbl.NumRows(); offset += chunkSize {
   248  		if err := writeRowGroup(offset, utils.Min(chunkSize, tbl.NumRows()-offset)); err != nil {
   249  			fw.Close()
   250  			return err
   251  		}
   252  	}
   253  	return nil
   254  }
   255  
   256  // Close flushes out the data and closes the file. It can be called multiple times,
   257  // subsequent calls after the first will have no effect.
   258  func (fw *FileWriter) Close() error {
   259  	if !fw.closed {
   260  		fw.closed = true
   261  		if fw.rgw != nil {
   262  			if err := fw.rgw.Close(); err != nil {
   263  				return err
   264  			}
   265  		}
   266  		return fw.wr.Close()
   267  	}
   268  	return nil
   269  }
   270  
   271  // WriteColumnChunked will write the data provided to the underlying file, using the provided
   272  // offset and size to allow writing subsets of data from the chunked column. It uses the current
   273  // column in the underlying row group writer as the starting point, allowing progressive
   274  // building of writing columns to a file via arrow data without needing to already have
   275  // a record or table.
   276  func (fw *FileWriter) WriteColumnChunked(data *arrow.Chunked, offset, size int64) error {
   277  	acw, err := NewArrowColumnWriter(data, offset, size, fw.manifest, fw.rgw, fw.colIdx)
   278  	if err != nil {
   279  		return err
   280  	}
   281  	fw.colIdx += acw.leafCount
   282  	return acw.Write(fw.ctx)
   283  }
   284  
   285  // WriteColumnData writes the entire array to the file as the next columns. Like WriteColumnChunked
   286  // it is based on the current column of the row group writer allowing progressive building
   287  // of the file by columns without needing a full record or table to write.
   288  func (fw *FileWriter) WriteColumnData(data arrow.Array) error {
   289  	chunked := arrow.NewChunked(data.DataType(), []arrow.Array{data})
   290  	defer chunked.Release()
   291  	return fw.WriteColumnChunked(chunked, 0, int64(data.Len()))
   292  }