github.com/apache/arrow/go/v14@v14.0.1/parquet/file/row_group_writer.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"github.com/apache/arrow/go/v14/parquet"
    21  	"github.com/apache/arrow/go/v14/parquet/internal/encryption"
    22  	"github.com/apache/arrow/go/v14/parquet/internal/utils"
    23  	"github.com/apache/arrow/go/v14/parquet/metadata"
    24  	"golang.org/x/xerrors"
    25  )
    26  
// RowGroupWriter is the base interface for writing rowgroups, the actual writer
// will be either the SerialRowGroupWriter or the BufferedRowGroupWriter
type RowGroupWriter interface {
	// NumColumns returns the number of columns for this row group writer.
	NumColumns() int
	// NumRows returns the current number of rows that have been written.
	// It returns an error if the row counts are unequal between the columns
	// that have been written so far.
	NumRows() (int, error)
	// TotalCompressedBytes returns the total compressed bytes so far, as
	// reported by the column writers the row group currently holds.
	TotalCompressedBytes() int64
	// TotalBytesWritten returns the total bytes written and flushed out.
	TotalBytesWritten() int64
	// Close closes any unclosed column writers and closes the rowgroup, writing
	// out the metadata. Subsequent calls have no effect.
	// It returns an error if columns contain unequal numbers of rows.
	Close() error
	// Buffered returns true if it's a BufferedRowGroupWriter and false for a
	// SerialRowGroupWriter
	Buffered() bool
}
    47  
// SerialRowGroupWriter expects each column to be written one after the other,
// data is flushed every time NextColumn is called, and NextColumn returns an
// error if the column being finished holds a different number of rows than
// the columns written before it.
type SerialRowGroupWriter interface {
	RowGroupWriter
	// NextColumn closes the column currently being written, if any, and
	// returns the writer for the next column chunk.
	NextColumn() (ColumnChunkWriter, error)
	// CurrentColumn returns the current column being built: if buffered it will
	// equal NumColumns, if serialized it will return which column is currently
	// being written.
	CurrentColumn() int
}
    58  
// BufferedRowGroupWriter allows writing to multiple columns simultaneously, data
// will not be flushed to the underlying writer until closing the RowGroupWriter.
//
// All columns must have equal numbers of rows before closing the row group,
// otherwise Close returns an error.
type BufferedRowGroupWriter interface {
	RowGroupWriter
	// Column returns the writer for column i, or an error when i is not a
	// valid column index for this row group.
	Column(i int) (ColumnChunkWriter, error)
}
    67  
// rowGroupWriter is the single implementation behind both the serial and the
// buffered row group writer interfaces; the buffered flag selects the mode.
type rowGroupWriter struct {
	// sink is the destination handed to every page writer this row group creates.
	sink          utils.WriterTell
	// metadata supplies the per-column chunk metadata builders and receives the
	// final row count and byte total when the row group is closed.
	metadata      *metadata.RowGroupMetaDataBuilder
	// props is consulted for per-column compression settings and the allocator.
	props         *parquet.WriterProperties
	// bytesWritten accumulates TotalBytesWritten of column writers once they
	// have been closed.
	bytesWritten  int64
	// closed is set by Close so that repeated calls become no-ops.
	closed        bool
	// ordinal identifies this row group; it is passed to the page writers and
	// to metadata.Finish.
	ordinal       int16
	// nextColumnIdx counts the column chunks started so far and provides the
	// column ordinal given to each page writer.
	nextColumnIdx int
	// nrows is the row count recorded by checkRowsWritten.
	nrows         int
	// buffered is true for a buffered row group and false for a serial one.
	buffered      bool
	// fileEncryptor is optional; when non-nil it provides the per-column
	// metadata and data encryptors.
	fileEncryptor encryption.FileEncryptor

	// columnWriters holds one writer per column in buffered mode. In serial
	// mode it is a single slot for the column currently being written (nil
	// until the first NextColumn call). Close sets the slice to nil.
	columnWriters []ColumnChunkWriter
	// pager is the page writer used in serial mode: created on the first
	// NextColumn call and Reset for each subsequent column.
	pager         PageWriter
}
    83  
    84  func newRowGroupWriter(sink utils.WriterTell, metadata *metadata.RowGroupMetaDataBuilder, ordinal int16, props *parquet.WriterProperties, buffered bool, fileEncryptor encryption.FileEncryptor) *rowGroupWriter {
    85  	ret := &rowGroupWriter{
    86  		sink:          sink,
    87  		metadata:      metadata,
    88  		props:         props,
    89  		ordinal:       ordinal,
    90  		buffered:      buffered,
    91  		fileEncryptor: fileEncryptor,
    92  	}
    93  	if buffered {
    94  		ret.initColumns()
    95  	} else {
    96  		ret.columnWriters = []ColumnChunkWriter{nil}
    97  	}
    98  	return ret
    99  }
   100  
   101  func (rg *rowGroupWriter) Buffered() bool { return rg.buffered }
   102  
   103  func (rg *rowGroupWriter) checkRowsWritten() error {
   104  	if len(rg.columnWriters) == 0 {
   105  		return nil
   106  	}
   107  
   108  	if !rg.buffered && rg.columnWriters[0] != nil {
   109  		current := rg.columnWriters[0].RowsWritten()
   110  		if rg.nrows == 0 {
   111  			rg.nrows = current
   112  		} else if rg.nrows != current {
   113  			return xerrors.Errorf("row mismatch for unbuffered row group: %d, count expected: %d, actual: %d", rg.ordinal, current, rg.nrows)
   114  		}
   115  	} else if rg.buffered {
   116  		current := rg.columnWriters[0].RowsWritten()
   117  		for i, wr := range rg.columnWriters[1:] {
   118  			if current != wr.RowsWritten() {
   119  				return xerrors.Errorf("row mismatch for buffered row group: %d, column: %d, count expected: %d, actual: %d", rg.ordinal, i+1, current, wr.RowsWritten())
   120  			}
   121  		}
   122  		rg.nrows = current
   123  	}
   124  	return nil
   125  }
   126  
   127  func (rg *rowGroupWriter) NumColumns() int { return rg.metadata.NumColumns() }
   128  func (rg *rowGroupWriter) NumRows() (int, error) {
   129  	err := rg.checkRowsWritten()
   130  	return rg.nrows, err
   131  }
   132  
   133  func (rg *rowGroupWriter) NextColumn() (ColumnChunkWriter, error) {
   134  	if rg.buffered {
   135  		panic("next column is not supported when a rowgroup is written by size")
   136  	}
   137  	if rg.columnWriters[0] != nil {
   138  		if err := rg.checkRowsWritten(); err != nil {
   139  			return nil, err
   140  		}
   141  	}
   142  
   143  	// throw an error if more columns are being written
   144  	colMeta := rg.metadata.NextColumnChunk()
   145  	if rg.columnWriters[0] != nil {
   146  		if err := rg.columnWriters[0].Close(); err != nil {
   147  			return nil, err
   148  		}
   149  		rg.bytesWritten += rg.columnWriters[0].TotalBytesWritten()
   150  	}
   151  	rg.nextColumnIdx++
   152  
   153  	path := colMeta.Descr().Path()
   154  	var (
   155  		metaEncryptor encryption.Encryptor
   156  		dataEncryptor encryption.Encryptor
   157  	)
   158  	if rg.fileEncryptor != nil {
   159  		metaEncryptor = rg.fileEncryptor.GetColumnMetaEncryptor(path)
   160  		dataEncryptor = rg.fileEncryptor.GetColumnDataEncryptor(path)
   161  	}
   162  
   163  	if rg.pager == nil {
   164  		var err error
   165  		rg.pager, err = NewPageWriter(rg.sink, rg.props.CompressionFor(path), rg.props.CompressionLevelFor(path), colMeta, rg.ordinal, int16(rg.nextColumnIdx-1), rg.props.Allocator(), false, metaEncryptor, dataEncryptor)
   166  		if err != nil {
   167  			return nil, err
   168  		}
   169  	} else {
   170  		rg.pager.Reset(rg.sink, rg.props.CompressionFor(path), rg.props.CompressionLevelFor(path), colMeta, rg.ordinal, int16(rg.nextColumnIdx-1), metaEncryptor, dataEncryptor)
   171  	}
   172  
   173  	rg.columnWriters[0] = NewColumnChunkWriter(colMeta, rg.pager, rg.props)
   174  	return rg.columnWriters[0], nil
   175  }
   176  
   177  func (rg *rowGroupWriter) Column(i int) (ColumnChunkWriter, error) {
   178  	if !rg.buffered {
   179  		panic("column is only supported when a bufferedrowgroup is being written")
   180  	}
   181  
   182  	if i >= 0 && i < len(rg.columnWriters) {
   183  		return rg.columnWriters[i], nil
   184  	}
   185  	return nil, xerrors.Errorf("invalid column number requested: %d", i)
   186  }
   187  
   188  func (rg *rowGroupWriter) CurrentColumn() int { return rg.metadata.CurrentColumn() }
   189  func (rg *rowGroupWriter) TotalCompressedBytes() int64 {
   190  	total := int64(0)
   191  	for _, wr := range rg.columnWriters {
   192  		if wr != nil {
   193  			total += wr.TotalCompressedBytes()
   194  		}
   195  	}
   196  	return total
   197  }
   198  
   199  func (rg *rowGroupWriter) TotalBytesWritten() int64 {
   200  	total := int64(0)
   201  	for _, wr := range rg.columnWriters {
   202  		if wr != nil {
   203  			total += wr.TotalBytesWritten()
   204  		}
   205  	}
   206  	return total + rg.bytesWritten
   207  }
   208  
   209  func (rg *rowGroupWriter) Close() error {
   210  	if !rg.closed {
   211  		rg.closed = true
   212  		if err := rg.checkRowsWritten(); err != nil {
   213  			return err
   214  		}
   215  
   216  		for _, wr := range rg.columnWriters {
   217  			if wr != nil {
   218  				if err := wr.Close(); err != nil {
   219  					return err
   220  				}
   221  				rg.bytesWritten += wr.TotalBytesWritten()
   222  			}
   223  		}
   224  
   225  		rg.columnWriters = nil
   226  		rg.metadata.SetNumRows(rg.nrows)
   227  		rg.metadata.Finish(rg.bytesWritten, rg.ordinal)
   228  	}
   229  	return nil
   230  }
   231  
   232  func (rg *rowGroupWriter) initColumns() error {
   233  	if rg.columnWriters == nil {
   234  		rg.columnWriters = make([]ColumnChunkWriter, 0, rg.NumColumns())
   235  	}
   236  	for i := 0; i < rg.NumColumns(); i++ {
   237  		colMeta := rg.metadata.NextColumnChunk()
   238  		path := colMeta.Descr().Path()
   239  		var (
   240  			metaEncryptor encryption.Encryptor
   241  			dataEncryptor encryption.Encryptor
   242  		)
   243  		if rg.fileEncryptor != nil {
   244  			metaEncryptor = rg.fileEncryptor.GetColumnMetaEncryptor(path)
   245  			dataEncryptor = rg.fileEncryptor.GetColumnDataEncryptor(path)
   246  		}
   247  		pager, err := NewPageWriter(rg.sink, rg.props.CompressionFor(path), rg.props.CompressionLevelFor(path), colMeta, rg.ordinal, int16(rg.nextColumnIdx), rg.props.Allocator(), rg.buffered, metaEncryptor, dataEncryptor)
   248  		if err != nil {
   249  			return err
   250  		}
   251  		rg.nextColumnIdx++
   252  		rg.columnWriters = append(rg.columnWriters, NewColumnChunkWriter(colMeta, pager, rg.props))
   253  	}
   254  	return nil
   255  }