github.com/apache/arrow/go/v14@v14.0.2/parquet/file/column_writer.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file

import (
	"bytes"
	"encoding/binary"
	"io"

	"github.com/apache/arrow/go/v14/arrow"
	"github.com/apache/arrow/go/v14/arrow/array"
	"github.com/apache/arrow/go/v14/arrow/bitutil"
	"github.com/apache/arrow/go/v14/arrow/memory"
	"github.com/apache/arrow/go/v14/parquet"
	"github.com/apache/arrow/go/v14/parquet/internal/encoding"
	"github.com/apache/arrow/go/v14/parquet/metadata"
	"github.com/apache/arrow/go/v14/parquet/schema"
)

//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata column_writer_types.gen.go.tmpl

// ColumnChunkWriter is the base interface for all column writers. To directly
// write data to the column, assert it to the correctly typed instance,
// such as *Int32ColumnChunkWriter.
type ColumnChunkWriter interface {
	// Close ends this column, flushing any remaining buffered data and
	// the dictionary page if one is in use
	Close() error
	// Type returns the underlying physical parquet type for this column
	Type() parquet.Type
	// Descr returns the column information for this writer
	Descr() *schema.Column
	// RowsWritten returns the number of rows that have so far been written with this writer
	RowsWritten() int
	// TotalCompressedBytes returns the number of bytes, after compression, that have been written so far
	TotalCompressedBytes() int64
	// TotalBytesWritten includes the bytes for writing dictionary pages, while TotalCompressedBytes is
	// just the data and page headers
	TotalBytesWritten() int64
	// Properties returns the current WriterProperties in use for this writer
	Properties() *parquet.WriterProperties
	// CurrentEncoder returns the current encoder that is being used
	// to encode new data written to this column
	CurrentEncoder() encoding.TypedEncoder
	// FallbackToPlain forces a dictionary-encoded column writer to
	// fall back to plain encoding, first flushing out any data it has
	// and then changing the encoder to use plain encoding from
	// here on out.
	//
	// This is automatically called if the dictionary reaches the
	// limit in the write properties or under specific conditions.
	//
	// Has no effect if the column is not currently dictionary encoded.
	FallbackToPlain()
	// PageStatistics returns the current page statistics for this
	// column writer. May be nil if stats are not enabled.
	PageStatistics() metadata.TypedStatistics
	// WriteDictIndices writes an arrow array of dictionary indices
	// to this column. This should only be called by pqarrow or
	// if you *really* know what you're doing.
	WriteDictIndices(arrow.Array, []int16, []int16) error

	// LevelInfo returns the definition/repetition level information
	// computed for this column
	LevelInfo() LevelInfo
	// SetBitsBuffer sets a scratch buffer of validity bits to use
	// when writing spaced values
	SetBitsBuffer(*memory.Buffer)
	// HasBitsBuffer reports whether a validity-bits scratch buffer is set
	HasBitsBuffer() bool
}
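
// Editorial usage sketch (not part of the original source): writing a batch
// of values through a concrete typed writer. The row group writer rgw and
// the surrounding error handling are assumed for illustration.
//
//	cw, _ := rgw.NextColumn() // cw is a ColumnChunkWriter
//	int32Writer := cw.(*Int32ColumnChunkWriter)
//	// write three non-null values (no def/rep levels for a required,
//	// non-repeated column)
//	if _, err := int32Writer.WriteBatch([]int32{1, 2, 3}, nil, nil); err != nil {
//		panic(err)
//	}
//	if err := int32Writer.Close(); err != nil {
//		panic(err)
//	}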

func computeLevelInfo(descr *schema.Column) (info LevelInfo) {
	info.DefLevel = descr.MaxDefinitionLevel()
	info.RepLevel = descr.MaxRepetitionLevel()

	minSpacedDefLevel := descr.MaxDefinitionLevel()
	n := descr.SchemaNode()
	for n != nil && n.RepetitionType() != parquet.Repetitions.Repeated {
		if n.RepetitionType() == parquet.Repetitions.Optional {
			minSpacedDefLevel--
		}
		n = n.Parent()
	}
	info.RepeatedAncestorDefLevel = minSpacedDefLevel
	return
}
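
// Editorial worked example (an illustration, not from the original file):
// for a leaf column with path a.b.c where a is optional, b is repeated, and
// c is required, the schema gives MaxDefinitionLevel(c) = 2 (a and b each
// add one) and MaxRepetitionLevel(c) = 1 (b is the only repeated node).
// Walking up from c stops at the repeated node b before seeing any optional
// nodes, so RepeatedAncestorDefLevel = 2: definition levels below 2 denote
// an empty list or a null ancestor rather than an allocated leaf slot.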

type columnWriter struct {
	metaData *metadata.ColumnChunkMetaDataBuilder
	descr    *schema.Column

	// scratch buffer if validity bits need to be recalculated
	bitsBuffer *memory.Buffer
	levelInfo  LevelInfo
	pager      PageWriter
	hasDict    bool
	encoding   parquet.Encoding
	props      *parquet.WriterProperties
	defEncoder encoding.LevelEncoder
	repEncoder encoding.LevelEncoder
	mem        memory.Allocator

	pageStatistics  metadata.TypedStatistics
	chunkStatistics metadata.TypedStatistics

	// total number of values stored in the current data page. This is the
	// maximum of the number of encoded definition levels and encoded values.
	// For non-repeated, required columns, this is equal to the number of
	// encoded values. For repeated or optional columns, there may be fewer
	// data values than levels, and this tells you how many encoded levels
	// there are in that case.
	numBufferedValues int64

	// total number of rows stored in the current data page. This may be
	// smaller than numBufferedValues when writing a column with repeated
	// values. This is the number of rows written since the last time we
	// flushed a page.
	numBufferedRows int

	// the total number of stored values in the current page. For repeated
	// or optional columns, this number may be lower than numBufferedValues.
	numDataValues int64

	rowsWritten       int
	totalBytesWritten int64
	// records the current number of compressed bytes in a column
	totalCompressedBytes int64
	closed               bool
	fallbackToNonDict    bool

	pages []DataPage

	defLevelSink *encoding.PooledBufferWriter
	repLevelSink *encoding.PooledBufferWriter

	uncompressedData bytes.Buffer
	compressedTemp   *bytes.Buffer

	currentEncoder encoding.TypedEncoder
}

func newColumnWriterBase(metaData *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) columnWriter {
	ret := columnWriter{
		metaData:     metaData,
		descr:        metaData.Descr(),
		levelInfo:    computeLevelInfo(metaData.Descr()),
		pager:        pager,
		hasDict:      useDict,
		encoding:     enc,
		props:        props,
		mem:          props.Allocator(),
		defLevelSink: encoding.NewPooledBufferWriter(0),
		repLevelSink: encoding.NewPooledBufferWriter(0),
	}
	if pager.HasCompressor() {
		ret.compressedTemp = new(bytes.Buffer)
	}
	if props.StatisticsEnabledFor(ret.descr.Path()) && ret.descr.SortOrder() != schema.SortUNKNOWN {
		ret.pageStatistics = metadata.NewStatistics(ret.descr, props.Allocator())
		ret.chunkStatistics = metadata.NewStatistics(ret.descr, props.Allocator())
	}

	ret.defEncoder.Init(parquet.Encodings.RLE, ret.descr.MaxDefinitionLevel(), ret.defLevelSink)
	ret.repEncoder.Init(parquet.Encodings.RLE, ret.descr.MaxRepetitionLevel(), ret.repLevelSink)

	ret.reset()

	return ret
}

func (w *columnWriter) CurrentEncoder() encoding.TypedEncoder    { return w.currentEncoder }
func (w *columnWriter) HasBitsBuffer() bool                      { return w.bitsBuffer != nil }
func (w *columnWriter) SetBitsBuffer(buf *memory.Buffer)         { w.bitsBuffer = buf }
func (w *columnWriter) PageStatistics() metadata.TypedStatistics { return w.pageStatistics }
func (w *columnWriter) LevelInfo() LevelInfo                     { return w.levelInfo }

func (w *columnWriter) Type() parquet.Type {
	return w.descr.PhysicalType()
}

func (w *columnWriter) Descr() *schema.Column {
	return w.descr
}

func (w *columnWriter) Properties() *parquet.WriterProperties {
	return w.props
}

func (w *columnWriter) TotalCompressedBytes() int64 {
	return w.totalCompressedBytes
}

func (w *columnWriter) TotalBytesWritten() int64 {
	return w.totalBytesWritten
}

func (w *columnWriter) RowsWritten() int {
	return w.rowsWritten + w.numBufferedRows
}

func (w *columnWriter) WriteDataPage(page DataPage) error {
	written, err := w.pager.WriteDataPage(page)
	w.totalBytesWritten += written
	return err
}

func (w *columnWriter) WriteDefinitionLevels(levels []int16) {
	w.defEncoder.EncodeNoFlush(levels)
}

func (w *columnWriter) WriteRepetitionLevels(levels []int16) {
	w.repEncoder.EncodeNoFlush(levels)
}

func (w *columnWriter) reset() {
	w.defLevelSink.Reset(0)
	w.repLevelSink.Reset(0)

	if w.props.DataPageVersion() == parquet.DataPageV1 {
		// offset the buffers to leave room at the front of each to record
		// the byte length of the RLE-encoded levels, which V1 data pages
		// store as a 4-byte little-endian prefix
		if w.descr.MaxDefinitionLevel() > 0 {
			w.defLevelSink.SetOffset(arrow.Uint32SizeBytes)
		}
		if w.descr.MaxRepetitionLevel() > 0 {
			w.repLevelSink.SetOffset(arrow.Uint32SizeBytes)
		}
	}

	w.defEncoder.Reset(w.descr.MaxDefinitionLevel())
	w.repEncoder.Reset(w.descr.MaxRepetitionLevel())
}
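
// Editorial sketch of the V1 level-buffer layout the offset above makes room
// for (an illustration, not code from the original file). After
// FlushCurrentPage fills in the prefix, each level sink holds:
//
//	+-------------------------+-------------------------------+
//	| uint32 (LE) byte length | RLE/bit-packed level data ... |
//	+-------------------------+-------------------------------+
//
// which is the representation the Parquet format expects for definition
// and repetition levels in a DataPage V1.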

// concatBuffers assembles a page body in the order the Parquet format
// expects: repetition levels, then definition levels, then encoded values.
func (w *columnWriter) concatBuffers(defLevelsSize, repLevelsSize int32, values []byte, wr io.Writer) {
	wr.Write(w.repLevelSink.Bytes()[:repLevelsSize])
	wr.Write(w.defLevelSink.Bytes()[:defLevelsSize])
	wr.Write(values)
}

func (w *columnWriter) EstimatedBufferedValueBytes() int64 {
	return w.currentEncoder.EstimatedDataEncodedSize()
}

func (w *columnWriter) commitWriteAndCheckPageLimit(numLevels, numValues int64) error {
	w.numBufferedValues += numLevels
	w.numDataValues += numValues

	enc := w.currentEncoder.EstimatedDataEncodedSize()
	if enc >= w.props.DataPageSize() {
		return w.FlushCurrentPage()
	}
	return nil
}
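
// Editorial note: with default WriterProperties, DataPageSize is
// parquet.DefaultDataPageSize (1 MiB), so the check above flushes a page
// once the encoder estimates at least that many encoded value bytes.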

func (w *columnWriter) FlushCurrentPage() error {
	var defLevelsRLESize, repLevelsRLESize int32

	values, err := w.currentEncoder.FlushValues()
	if err != nil {
		return err
	}
	defer values.Release()

	isV1DataPage := w.props.DataPageVersion() == parquet.DataPageV1
	if w.descr.MaxDefinitionLevel() > 0 {
		w.defEncoder.Flush()
		w.defLevelSink.SetOffset(0)
		sz := w.defEncoder.Len()
		if isV1DataPage {
			sz += arrow.Uint32SizeBytes
			binary.LittleEndian.PutUint32(w.defLevelSink.Bytes(), uint32(w.defEncoder.Len()))
		}
		defLevelsRLESize = int32(sz)
	}

	if w.descr.MaxRepetitionLevel() > 0 {
		w.repEncoder.Flush()
		w.repLevelSink.SetOffset(0)
		if isV1DataPage {
			binary.LittleEndian.PutUint32(w.repLevelSink.Bytes(), uint32(w.repEncoder.Len()))
		}
		repLevelsRLESize = int32(w.repLevelSink.Len())
	}

	uncompressed := defLevelsRLESize + repLevelsRLESize + int32(values.Len())
	if isV1DataPage {
		err = w.buildDataPageV1(defLevelsRLESize, repLevelsRLESize, uncompressed, values.Bytes())
	} else {
		err = w.buildDataPageV2(defLevelsRLESize, repLevelsRLESize, uncompressed, values.Bytes())
	}

	w.reset()
	w.rowsWritten += w.numBufferedRows
	w.numBufferedValues, w.numDataValues, w.numBufferedRows = 0, 0, 0
	return err
}
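
// Editorial note: the two data page formats handle compression differently.
// V1 concatenates [rep levels][def levels][values] and compresses the whole
// body together, while V2 keeps the RLE level runs uncompressed (their byte
// lengths are recorded in the page header rather than as inline prefixes)
// and compresses only the values section, as buildDataPageV1 and
// buildDataPageV2 below illustrate.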

func (w *columnWriter) buildDataPageV1(defLevelsRLESize, repLevelsRLESize, uncompressed int32, values []byte) error {
	w.uncompressedData.Reset()
	w.uncompressedData.Grow(int(uncompressed))
	w.concatBuffers(defLevelsRLESize, repLevelsRLESize, values, &w.uncompressedData)

	pageStats, err := w.getPageStatistics()
	if err != nil {
		return err
	}
	pageStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
	pageStats.Signed = schema.SortSIGNED == w.descr.SortOrder()
	w.resetPageStatistics()

	var data []byte
	if w.pager.HasCompressor() {
		w.compressedTemp.Reset()
		data = w.pager.Compress(w.compressedTemp, w.uncompressedData.Bytes())
	} else {
		data = w.uncompressedData.Bytes()
	}

	// if dictionary encoding is in use and we haven't fallen back to plain,
	// buffer the page so it can be written after the dictionary page;
	// otherwise write the page to the sink eagerly
	if w.hasDict && !w.fallbackToNonDict {
		pageSlice := make([]byte, len(data))
		copy(pageSlice, data)
		page := NewDataPageV1WithStats(memory.NewBufferBytes(pageSlice), int32(w.numBufferedValues), w.encoding, parquet.Encodings.RLE, parquet.Encodings.RLE, uncompressed, pageStats)
		w.totalCompressedBytes += int64(page.buf.Len()) // + size of page header
		w.pages = append(w.pages, page)
	} else {
		w.totalCompressedBytes += int64(len(data))
		dp := NewDataPageV1WithStats(memory.NewBufferBytes(data), int32(w.numBufferedValues), w.encoding, parquet.Encodings.RLE, parquet.Encodings.RLE, uncompressed, pageStats)
		defer dp.Release()
		return w.WriteDataPage(dp)
	}
	return nil
}
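
// Editorial note: the buffering above exists because the Parquet format
// requires a dictionary page to be the first page of its column chunk.
// While dictionary encoding is active, data pages are held in w.pages and
// only written out (after the dictionary page) by Close or by a fallback
// to plain encoding.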

func (w *columnWriter) buildDataPageV2(defLevelsRLESize, repLevelsRLESize, uncompressed int32, values []byte) error {
	var data []byte
	if w.pager.HasCompressor() {
		w.compressedTemp.Reset()
		data = w.pager.Compress(w.compressedTemp, values)
	} else {
		data = values
	}

	// concatenate uncompressed levels and the possibly compressed values
	var combined bytes.Buffer
	combined.Grow(int(defLevelsRLESize + repLevelsRLESize + int32(len(data))))
	w.concatBuffers(defLevelsRLESize, repLevelsRLESize, data, &combined)

	pageStats, err := w.getPageStatistics()
	if err != nil {
		return err
	}
	pageStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
	pageStats.Signed = schema.SortSIGNED == w.descr.SortOrder()
	w.resetPageStatistics()

	numValues := int32(w.numBufferedValues)
	numRows := int32(w.numBufferedRows)
	nullCount := int32(pageStats.NullCount)
	defLevelsByteLen := defLevelsRLESize
	repLevelsByteLen := repLevelsRLESize

	page := NewDataPageV2WithStats(memory.NewBufferBytes(combined.Bytes()), numValues, nullCount, numRows, w.encoding,
		defLevelsByteLen, repLevelsByteLen, uncompressed, w.pager.HasCompressor(), pageStats)
	if w.hasDict && !w.fallbackToNonDict {
		w.totalCompressedBytes += int64(page.buf.Len()) // + size of page header
		w.pages = append(w.pages, page)
	} else {
		w.totalCompressedBytes += int64(combined.Len())
		defer page.Release()
		return w.WriteDataPage(page)
	}
	return nil
}

func (w *columnWriter) FlushBufferedDataPages() (err error) {
	if w.numBufferedValues > 0 {
		if err = w.FlushCurrentPage(); err != nil {
			return err
		}
	}

	for _, p := range w.pages {
		defer p.Release()
		if err = w.WriteDataPage(p); err != nil {
			return err
		}
	}
	w.pages = w.pages[:0]
	w.totalCompressedBytes = 0
	return
}

func (w *columnWriter) writeLevels(numValues int64, defLevels, repLevels []int16) int64 {
	toWrite := int64(0)
	// if the field is required and non-repeated, no definition levels are needed
	if defLevels != nil && w.descr.MaxDefinitionLevel() > 0 {
		for _, v := range defLevels[:numValues] {
			if v == w.descr.MaxDefinitionLevel() {
				toWrite++
			}
		}
		w.WriteDefinitionLevels(defLevels[:numValues])
	} else {
		toWrite = numValues
	}

	if repLevels != nil && w.descr.MaxRepetitionLevel() > 0 {
		// a row could include more than one value, so count the
		// occasions where we start a new row (repetition level 0)
		for _, v := range repLevels[:numValues] {
			if v == 0 {
				w.numBufferedRows++
			}
		}

		w.WriteRepetitionLevels(repLevels[:numValues])
	} else {
		// each value is exactly one row
		w.numBufferedRows += int(numValues)
	}
	return toWrite
}
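
// Editorial worked example (not from the original file): for a list<int32>
// column with MaxDefinitionLevel = 2 and MaxRepetitionLevel = 1, writing the
// rows [1, 2], [] and NULL produces
//
//	defLevels = []int16{2, 2, 1, 0} // 2 = value, 1 = empty list, 0 = null list
//	repLevels = []int16{0, 1, 0, 0} // 0 marks the start of each row
//
// so writeLevels returns toWrite = 2 (levels equal to the max def level) and
// increments numBufferedRows by 3 (repetition levels equal to zero).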

func (w *columnWriter) writeLevelsSpaced(numLevels int64, defLevels, repLevels []int16) {
	if w.descr.MaxDefinitionLevel() > 0 {
		w.WriteDefinitionLevels(defLevels[:numLevels])
	}

	if w.descr.MaxRepetitionLevel() > 0 {
		// count the new rows, bounded to the levels actually being written
		for _, v := range repLevels[:numLevels] {
			if v == 0 {
				w.numBufferedRows++
			}
		}
		w.WriteRepetitionLevels(repLevels[:numLevels])
	} else {
		// each value is exactly one row
		w.numBufferedRows += int(numLevels)
	}
}

func (w *columnWriter) WriteDictionaryPage() error {
	dictEncoder := w.currentEncoder.(encoding.DictEncoder)
	buffer := memory.NewResizableBuffer(w.mem)
	buffer.Resize(dictEncoder.DictEncodedSize())
	dictEncoder.WriteDict(buffer.Bytes())
	defer buffer.Release()

	page := NewDictionaryPage(buffer, int32(dictEncoder.NumEntries()), w.props.DictionaryPageEncoding())
	written, err := w.pager.WriteDictionaryPage(page)
	w.totalBytesWritten += written
	return err
}

type batchWriteInfo struct {
	batchNum  int64
	nullCount int64
}

func (b batchWriteInfo) numSpaced() int64 { return b.batchNum + b.nullCount }

// maybeCalculateValidityBits always computes the batch value count and null
// count returned in the batchWriteInfo. Additionally it updates the validity
// bitmap if required (i.e. if at least one level of nullable structs directly
// precedes the leaf node).
func (w *columnWriter) maybeCalculateValidityBits(defLevels []int16, batchSize int64) (out batchWriteInfo) {
	if w.bitsBuffer == nil {
		if w.levelInfo.DefLevel == 0 {
			// in this case def levels should be null and we only
			// need to output counts which will always be equal to
			// the batch size passed in (max def level == 0 indicates
			// there cannot be repeated or null fields)
			out.batchNum = batchSize
			out.nullCount = 0
		} else {
			var (
				toWrite       int64
				spacedToWrite int64
			)
			for i := int64(0); i < batchSize; i++ {
				if defLevels[i] == w.levelInfo.DefLevel {
					toWrite++
				}
				if defLevels[i] >= w.levelInfo.RepeatedAncestorDefLevel {
					spacedToWrite++
				}
			}
			out.batchNum += toWrite
			out.nullCount = spacedToWrite - toWrite
		}
		return
	}

	// avoid shrinking the buffer to fit, since that could cause another allocation
	newBitmapSize := bitutil.BytesForBits(batchSize)
	if newBitmapSize != int64(w.bitsBuffer.Len()) {
		w.bitsBuffer.ResizeNoShrink(int(newBitmapSize))
	}

	vio := ValidityBitmapInputOutput{
		ValidBits:      w.bitsBuffer.Bytes(),
		ReadUpperBound: batchSize,
	}
	DefLevelsToBitmap(defLevels[:batchSize], w.levelInfo, &vio)
	out.batchNum = vio.Read - vio.NullCount
	out.nullCount = vio.NullCount
	return
}
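
// Editorial worked example (illustrative, not from the original file): for a
// schema "repeated group b { optional int32 c }" the leaf has DefLevel = 2
// and RepeatedAncestorDefLevel = 1. Given defLevels = [2, 1, 0, 2]:
//
//	def == 2 -> a real value            (toWrite = 2)
//	def >= 1 -> occupies a spaced slot  (spacedToWrite = 3)
//	def == 0 -> empty list, no slot
//
// so out.batchNum = 2 and out.nullCount = 3 - 2 = 1.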

func (w *columnWriter) getPageStatistics() (enc metadata.EncodedStatistics, err error) {
	if w.pageStatistics != nil {
		enc, err = w.pageStatistics.Encode()
	}
	return
}

func (w *columnWriter) getChunkStatistics() (enc metadata.EncodedStatistics, err error) {
	if w.chunkStatistics != nil {
		enc, err = w.chunkStatistics.Encode()
	}
	return
}

// resetPageStatistics merges the current page statistics into the chunk
// statistics and then resets the page statistics for the next page
func (w *columnWriter) resetPageStatistics() {
	if w.chunkStatistics != nil {
		w.chunkStatistics.Merge(w.pageStatistics)
		w.pageStatistics.Reset()
	}
}

func (w *columnWriter) Close() (err error) {
	if !w.closed {
		w.closed = true
		if w.hasDict && !w.fallbackToNonDict {
			if err = w.WriteDictionaryPage(); err != nil {
				return err
			}
		}

		if err = w.FlushBufferedDataPages(); err != nil {
			return err
		}

		// ensure we release and reset everything even if we
		// error out from the chunk statistics handling
		defer func() {
			w.defLevelSink.Reset(0)
			w.repLevelSink.Reset(0)
			if w.bitsBuffer != nil {
				w.bitsBuffer.Release()
				w.bitsBuffer = nil
			}

			w.currentEncoder.Release()
			w.currentEncoder = nil
		}()

		var chunkStats metadata.EncodedStatistics
		chunkStats, err = w.getChunkStatistics()
		if err != nil {
			return err
		}

		chunkStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
		chunkStats.Signed = schema.SortSIGNED == w.descr.SortOrder()

		if w.rowsWritten > 0 && chunkStats.IsSet() {
			w.metaData.SetStats(chunkStats)
		}
		err = w.pager.Close(w.hasDict, w.fallbackToNonDict)
	}
	return err
}

func (w *columnWriter) doBatches(total int64, repLevels []int16, action func(offset, batch int64)) {
	batchSize := w.props.WriteBatchSize()
	// if we're writing V1 data pages, have no rep levels, or the max rep level
	// is 0, then just use the regular doBatches function
	if w.props.DataPageVersion() == parquet.DataPageV1 || repLevels == nil || w.descr.MaxRepetitionLevel() == 0 {
		doBatches(total, batchSize, action)
		return
	}

	// if we get here that means we have repetition levels to write and we're
	// writing V2 data pages. Since we check whether to flush after each batch
	// we write, ensuring all the batches begin and end on row boundaries lets
	// us avoid complex logic inside the flushing and batch-writing functions.
	// The WriteBatch functions recover from panics, so we can just panic here
	// on a failure and it'll get caught by the WriteBatch function above us.
	if int64(len(repLevels)) < total {
		// if we're writing repLevels there has to be at least enough in the slice
		// to write the total number that we're being asked to write
		panic("columnwriter: not enough repetition levels for batch to write")
	}

	if repLevels[0] != 0 {
		panic("columnwriter: batch writing for V2 data pages must start at a row boundary")
	}

	// loop by batchSize, but make sure we're ending/starting each batch on a row boundary
	var batchStart, batch int64
	for batchStart = 0; batchStart+batchSize < int64(len(repLevels)); batchStart += batch {
		// check one past the last value of the batch to see if it starts a new
		// row. if it doesn't, shrink the batch so that it ends on the nearest
		// previous row boundary
		batch = batchSize
		for ; repLevels[batchStart+batch] != 0; batch-- {
		}
		// batchStart <--> batch now begins and ends on a row boundary!
		action(batchStart, batch)
	}
	action(batchStart, int64(len(repLevels))-batchStart)
}
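
// Editorial worked example (not from the original file): with
// repLevels = [0, 1, 1, 0, 1, 0] and batchSize = 4, the first iteration
// checks repLevels[4] == 1, shrinks the batch to 3 so it ends just before
// the row starting at index 3, and calls action(0, 3); the final call
// action(3, 3) then covers the remaining two rows. Every batch therefore
// begins and ends on a row boundary.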

func doBatches(total, batchSize int64, action func(offset, batch int64)) {
	numBatches := total / batchSize
	for i := int64(0); i < numBatches; i++ {
		action(i*batchSize, batchSize)
	}
	if total%batchSize > 0 {
		action(numBatches*batchSize, total%batchSize)
	}
}
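
// Editorial example: doBatches(10, 4, action) invokes action(0, 4),
// action(4, 4) and then action(8, 2) for the remainder.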

func levelSliceOrNil(rep []int16, offset, batch int64) []int16 {
	if rep == nil {
		return nil
	}
	return rep[offset : batch+offset]
}

//lint:ignore U1000 maybeReplaceValidity
func (w *columnWriter) maybeReplaceValidity(values arrow.Array, newNullCount int64) arrow.Array {
	if w.bitsBuffer == nil {
		values.Retain()
		return values
	}

	if len(values.Data().Buffers()) == 0 {
		values.Retain()
		return values
	}

	buffers := make([]*memory.Buffer, len(values.Data().Buffers()))
	copy(buffers, values.Data().Buffers())
	// bitsBuffer should already be the offset slice of the validity bits
	// we want so we don't need to manually slice the validity buffer
	buffers[0] = w.bitsBuffer

	if values.Data().Offset() > 0 {
		// slice the value buffer to account for the array's offset; note that
		// the hard-coded Int32SizeBytes assumes 4-byte values (e.g. dictionary
		// indices). the end bound must include the offset or the slice would
		// be truncated.
		data := values.Data()
		start := data.Offset() * arrow.Int32SizeBytes
		end := (data.Offset() + data.Len()) * arrow.Int32SizeBytes
		buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[start:end])
	}

	data := array.NewData(values.DataType(), values.Len(), buffers, nil, int(newNullCount), 0)
	defer data.Release()
	return array.MakeFromData(data)
}