github.com/apache/arrow/go/v7@v7.0.1/parquet/pqarrow/file_writer.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package pqarrow
    18  
    19  import (
    20  	"context"
    21  	"encoding/base64"
    22  	"io"
    23  
    24  	"github.com/apache/arrow/go/v7/arrow"
    25  	"github.com/apache/arrow/go/v7/arrow/flight"
    26  	"github.com/apache/arrow/go/v7/parquet"
    27  	"github.com/apache/arrow/go/v7/parquet/file"
    28  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    29  	"github.com/apache/arrow/go/v7/parquet/metadata"
    30  	"golang.org/x/xerrors"
    31  )
    32  
    33  // WriteTable is a convenience function to create and write a full array.Table to a parquet file. The schema
    34  // and columns will be determined by the schema of the table, writing the file out to the the provided writer.
    35  // The chunksize will be utilized in order to determine the size of the row groups.
    36  func WriteTable(tbl arrow.Table, w io.Writer, chunkSize int64, props *parquet.WriterProperties, arrprops ArrowWriterProperties) error {
    37  	writer, err := NewFileWriter(tbl.Schema(), w, props, arrprops)
    38  	if err != nil {
    39  		return err
    40  	}
    41  
    42  	if err := writer.WriteTable(tbl, chunkSize); err != nil {
    43  		return err
    44  	}
    45  
    46  	return writer.Close()
    47  }
    48  
// FileWriter is an object for writing Arrow directly to a parquet file.
//
// It wraps a file.Writer and tracks the row group currently being written
// together with the index of the next leaf column inside that row group.
type FileWriter struct {
	wr         *file.Writer          // underlying parquet writer that row groups are appended to
	schema     *arrow.Schema         // arrow schema that written records and tables must equal
	manifest   *SchemaManifest       // schema manifest handed to the arrow column writers
	rgw        file.RowGroupWriter   // current row group writer; nil until a row group is started
	arrowProps ArrowWriterProperties // arrow writer properties supplied at construction
	ctx        context.Context       // write context passed to every column write
	colIdx     int                   // index of the next leaf column in the current row group
	closed     bool                  // set by Close so that repeated calls are no-ops
}
    60  
    61  // NewFileWriter returns a writer for writing Arrow directly to a parquetfile, rather than
    62  // the ArrowColumnWriter and WriteArrow functions which allow writing arrow to an existing
    63  // file.Writer, this will create a new file.Writer based on the schema provided.
    64  func NewFileWriter(arrschema *arrow.Schema, w io.Writer, props *parquet.WriterProperties, arrprops ArrowWriterProperties) (*FileWriter, error) {
    65  	if props == nil {
    66  		props = parquet.NewWriterProperties()
    67  	}
    68  
    69  	pqschema, err := ToParquet(arrschema, props, arrprops)
    70  	if err != nil {
    71  		return nil, err
    72  	}
    73  
    74  	meta := make(metadata.KeyValueMetadata, 0)
    75  	if arrprops.storeSchema {
    76  		for i := 0; i < arrschema.Metadata().Len(); i++ {
    77  			meta.Append(arrschema.Metadata().Keys()[i], arrschema.Metadata().Values()[i])
    78  		}
    79  
    80  		serializedSchema := flight.SerializeSchema(arrschema, props.Allocator())
    81  		meta.Append("ARROW:schema", base64.RawStdEncoding.EncodeToString(serializedSchema))
    82  	}
    83  
    84  	schemaNode := pqschema.Root()
    85  	baseWriter := file.NewParquetWriter(w, schemaNode, file.WithWriterProps(props), file.WithWriteMetadata(meta))
    86  
    87  	manifest, err := NewSchemaManifest(pqschema, nil, &ArrowReadProperties{})
    88  	if err != nil {
    89  		return nil, err
    90  	}
    91  
    92  	return &FileWriter{wr: baseWriter, schema: arrschema, manifest: manifest, arrowProps: arrprops, ctx: NewArrowWriteContext(context.TODO(), &arrprops)}, nil
    93  }
    94  
    95  // NewRowGroup does what it says on the tin, creates a new row group in the underlying file.
    96  // Equivalent to `AppendRowGroup` on a file.Writer
    97  func (fw *FileWriter) NewRowGroup() {
    98  	if fw.rgw != nil {
    99  		fw.rgw.Close()
   100  	}
   101  	fw.rgw = fw.wr.AppendRowGroup()
   102  	fw.colIdx = 0
   103  }
   104  
   105  // NewBufferedRowGroup starts a new memory Buffered Row Group to allow writing columns / records
   106  // without immediately flushing them to disk. This allows using WriteBuffered to write records
   107  // and decide where to break your rowgroup based on the TotalBytesWritten rather than on the max
   108  // row group len. If using Records, this should be paired with WriteBuffered, while
   109  // Write will always write a new record as a row group in and of itself.
   110  func (fw *FileWriter) NewBufferedRowGroup() {
   111  	if fw.rgw != nil {
   112  		fw.rgw.Close()
   113  	}
   114  	fw.rgw = fw.wr.AppendBufferedRowGroup()
   115  	fw.colIdx = 0
   116  }
   117  
   118  // RowGroupTotalCompressedBytes returns the total number of bytes after compression
   119  // that have been written to the current row group so far.
   120  func (fw *FileWriter) RowGroupTotalCompressedBytes() int64 {
   121  	if fw.rgw != nil {
   122  		return fw.rgw.TotalCompressedBytes()
   123  	}
   124  	return 0
   125  }
   126  
   127  // RowGroupTotalBytesWritten returns the total number of bytes written and flushed out in
   128  // the current row group.
   129  func (fw *FileWriter) RowGroupTotalBytesWritten() int64 {
   130  	if fw.rgw != nil {
   131  		return fw.rgw.TotalBytesWritten()
   132  	}
   133  	return 0
   134  }
   135  
   136  func (fw *FileWriter) WriteBuffered(rec arrow.Record) error {
   137  	if !rec.Schema().Equal(fw.schema) {
   138  		return xerrors.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema)
   139  	}
   140  
   141  	var (
   142  		recList []arrow.Record
   143  		maxRows = fw.wr.Properties().MaxRowGroupLength()
   144  		curRows int
   145  		err     error
   146  	)
   147  	if fw.rgw != nil {
   148  		if curRows, err = fw.rgw.NumRows(); err != nil {
   149  			return err
   150  		}
   151  	} else {
   152  		fw.NewBufferedRowGroup()
   153  	}
   154  
   155  	if int64(curRows)+rec.NumRows() <= maxRows {
   156  		recList = []arrow.Record{rec}
   157  	} else {
   158  		recList = []arrow.Record{rec.NewSlice(0, maxRows-int64(curRows))}
   159  		defer recList[0].Release()
   160  		for offset := maxRows - int64(curRows); offset < rec.NumRows(); offset += maxRows {
   161  			s := rec.NewSlice(offset, offset+utils.Min(maxRows, rec.NumRows()-offset))
   162  			defer s.Release()
   163  			recList = append(recList, s)
   164  		}
   165  	}
   166  
   167  	for idx, r := range recList {
   168  		if idx > 0 {
   169  			fw.NewBufferedRowGroup()
   170  		}
   171  		for i := 0; i < int(r.NumCols()); i++ {
   172  			if err := fw.WriteColumnData(r.Column(i)); err != nil {
   173  				fw.Close()
   174  				return err
   175  			}
   176  		}
   177  	}
   178  	fw.colIdx = 0
   179  	return nil
   180  }
   181  
   182  // Write an arrow Record Batch to the file, respecting the MaxRowGroupLength in the writer
   183  // properties to determine whether or not a new row group is created while writing.
   184  func (fw *FileWriter) Write(rec arrow.Record) error {
   185  	if !rec.Schema().Equal(fw.schema) {
   186  		return xerrors.Errorf("record schema does not match writer's. \nrecord: %s\nwriter: %s", rec.Schema(), fw.schema)
   187  	}
   188  
   189  	var recList []arrow.Record
   190  	rowgroupLen := fw.wr.Properties().MaxRowGroupLength()
   191  	if rec.NumRows() > rowgroupLen {
   192  		recList = make([]arrow.Record, 0)
   193  		for offset := int64(0); offset < rec.NumRows(); offset += rowgroupLen {
   194  			s := rec.NewSlice(offset, offset+utils.Min(rowgroupLen, rec.NumRows()-offset))
   195  			defer s.Release()
   196  			recList = append(recList, s)
   197  		}
   198  	} else {
   199  		recList = []arrow.Record{rec}
   200  	}
   201  
   202  	for _, r := range recList {
   203  		fw.NewRowGroup()
   204  		for i := 0; i < int(r.NumCols()); i++ {
   205  			if err := fw.WriteColumnData(r.Column(i)); err != nil {
   206  				fw.Close()
   207  				return err
   208  			}
   209  		}
   210  	}
   211  	fw.colIdx = 0
   212  	return nil
   213  }
   214  
   215  // WriteTable writes an arrow table to the underlying file using chunkSize to determine
   216  // the size to break at for making row groups. Writing a table will always create a new
   217  // row group for each chunk of chunkSize rows in the table. Calling this with 0 rows will
   218  // still write a 0 length Row Group to the file.
   219  func (fw *FileWriter) WriteTable(tbl arrow.Table, chunkSize int64) error {
   220  	if chunkSize <= 0 && tbl.NumRows() > 0 {
   221  		return xerrors.New("chunk size per row group must be greater than 0")
   222  	} else if !tbl.Schema().Equal(fw.schema) {
   223  		return xerrors.Errorf("table schema does not match writer's. \nTable: %s\n writer: %s", tbl.Schema(), fw.schema)
   224  	} else if chunkSize > fw.wr.Properties().MaxRowGroupLength() {
   225  		chunkSize = fw.wr.Properties().MaxRowGroupLength()
   226  	}
   227  
   228  	writeRowGroup := func(offset, size int64) error {
   229  		fw.NewRowGroup()
   230  		for i := 0; i < int(tbl.NumCols()); i++ {
   231  			if err := fw.WriteColumnChunked(tbl.Column(i).Data(), offset, size); err != nil {
   232  				return err
   233  			}
   234  		}
   235  		return nil
   236  	}
   237  
   238  	if tbl.NumRows() == 0 {
   239  		if err := writeRowGroup(0, 0); err != nil {
   240  			fw.Close()
   241  			return err
   242  		}
   243  		return nil
   244  	}
   245  
   246  	for offset := int64(0); offset < tbl.NumRows(); offset += chunkSize {
   247  		if err := writeRowGroup(offset, utils.Min(chunkSize, tbl.NumRows()-offset)); err != nil {
   248  			fw.Close()
   249  			return err
   250  		}
   251  	}
   252  	return nil
   253  }
   254  
   255  // Close flushes out the data and closes the file. It can be called multiple times,
   256  // subsequent calls after the first will have no effect.
   257  func (fw *FileWriter) Close() error {
   258  	if !fw.closed {
   259  		fw.closed = true
   260  		if fw.rgw != nil {
   261  			if err := fw.rgw.Close(); err != nil {
   262  				return err
   263  			}
   264  		}
   265  		return fw.wr.Close()
   266  	}
   267  	return nil
   268  }
   269  
   270  // WriteColumnChunked will write the data provided to the underlying file, using the provided
   271  // offset and size to allow writing subsets of data from the chunked column. It uses the current
   272  // column in the underlying row group writer as the starting point, allowing progressive
   273  // building of writing columns to a file via arrow data without needing to already have
   274  // a record or table.
   275  func (fw *FileWriter) WriteColumnChunked(data *arrow.Chunked, offset, size int64) error {
   276  	acw, err := NewArrowColumnWriter(data, offset, size, fw.manifest, fw.rgw, fw.colIdx)
   277  	if err != nil {
   278  		return err
   279  	}
   280  	fw.colIdx += acw.leafCount
   281  	return acw.Write(fw.ctx)
   282  }
   283  
   284  // WriteColumnData writes the entire array to the file as the next columns. Like WriteColumnChunked
   285  // it is based on the current column of the row group writer allowing progressive building
   286  // of the file by columns without needing a full record or table to write.
   287  func (fw *FileWriter) WriteColumnData(data arrow.Array) error {
   288  	chunked := arrow.NewChunked(data.DataType(), []arrow.Array{data})
   289  	defer chunked.Release()
   290  	return fw.WriteColumnChunked(chunked, 0, int64(data.Len()))
   291  }