github.com/apache/arrow/go/v16@v16.1.0/parquet/file/column_writer.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  package file
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"io"
    23  
    24  	"github.com/apache/arrow/go/v16/arrow"
    25  	"github.com/apache/arrow/go/v16/arrow/array"
    26  	"github.com/apache/arrow/go/v16/arrow/bitutil"
    27  	"github.com/apache/arrow/go/v16/arrow/memory"
    28  	"github.com/apache/arrow/go/v16/parquet"
    29  	"github.com/apache/arrow/go/v16/parquet/internal/encoding"
    30  	"github.com/apache/arrow/go/v16/parquet/metadata"
    31  	"github.com/apache/arrow/go/v16/parquet/schema"
    32  )
    33  
    34  //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata column_writer_types.gen.go.tmpl
    35  
    36  // ColumnChunkWriter is the base interface for all column writers. To write
    37  // data directly to the column, you need to type-assert it to the correctly
    38  // typed instance, such as Int32ColumnChunkWriter.
    39  type ColumnChunkWriter interface {
    40  	// Close ends this column, flushing any remaining buffered data, and returns an error if one occurs
    41  	Close() error
    42  	// Type returns the underlying physical parquet type for this column
    43  	Type() parquet.Type
    44  	// Descr returns the column information for this writer
    45  	Descr() *schema.Column
    46  	// RowsWritten returns the number of rows that have so far been written with this writer
    47  	RowsWritten() int
    48  	// TotalCompressedBytes returns the number of bytes, after compression, that have been written so far
    49  	TotalCompressedBytes() int64
    50  	// TotalBytesWritten includes the bytes for writing dictionary pages, while TotalCompressedBytes is
    51  	// just the data and page headers
    52  	TotalBytesWritten() int64
    53  	// Properties returns the current WriterProperties in use for this writer
    54  	Properties() *parquet.WriterProperties
    55  	// CurrentEncoder returns the current encoder that is being used
    56  	// to encode new data written to this column
    57  	CurrentEncoder() encoding.TypedEncoder
    58  	// FallbackToPlain forces a dictionary encoded column writer to
    59  	// fall back to plain encoding, first flushing out any data it has
    60  	// and then switching the encoder to plain encoding from
    61  	// here on out.
    62  	//
    63  	// This is automatically called if the dictionary reaches the
    64  	// limit in the write properties or under specific conditions.
    65  	//
    66  	// Has no effect if the column is not currently dictionary encoded.
    67  	FallbackToPlain()
    68  	// PageStatistics returns the current page statistics for this
    69  	// column writer. May be nil if stats are not enabled.
    70  	PageStatistics() metadata.TypedStatistics
    71  	// WriteDictIndices writes an arrow array of dictionary indices
    72  	// to this column. This should only be called by pqarrow or
    73  	// if you *really* know what you're doing.
    74  	WriteDictIndices(arrow.Array, []int16, []int16) error
    75  
    76  	LevelInfo() LevelInfo         // LevelInfo returns the definition/repetition level info for this column
    77  	SetBitsBuffer(*memory.Buffer) // SetBitsBuffer sets a scratch buffer of recalculated validity bits
    78  	HasBitsBuffer() bool          // HasBitsBuffer reports whether a validity bits buffer has been set
    79  }
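
        // A minimal usage sketch (assumed setup: rgw is a SerialRowGroupWriter whose
        // next column is a required, non-repeated INT32). NextColumn returns a
        // ColumnChunkWriter that must be type-asserted before writing:
        //
        //	cw, err := rgw.NextColumn()
        //	if err != nil {
        //		return err
        //	}
        //	i32w, ok := cw.(*Int32ColumnChunkWriter)
        //	if !ok {
        //		return fmt.Errorf("column is not INT32")
        //	}
        //	// defLevels and repLevels may be nil for required, non-repeated columns
        //	if _, err := i32w.WriteBatch([]int32{1, 2, 3}, nil, nil); err != nil {
        //		return err
        //	}
        //	if err := i32w.Close(); err != nil {
        //		return err
        //	}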
    80  
    81  func computeLevelInfo(descr *schema.Column) (info LevelInfo) {
    82  	info.DefLevel = descr.MaxDefinitionLevel()
    83  	info.RepLevel = descr.MaxRepetitionLevel()
    84  
    85  	minSpacedDefLevel := descr.MaxDefinitionLevel()
    86  	n := descr.SchemaNode()
    87  	for n != nil && n.RepetitionType() != parquet.Repetitions.Repeated {
    88  		if n.RepetitionType() == parquet.Repetitions.Optional {
    89  			minSpacedDefLevel--
    90  		}
    91  		n = n.Parent()
    92  	}
    93  	info.RepeatedAncestorDefLevel = minSpacedDefLevel
    94  	return
    95  }
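
        // A worked sketch of the walk above: given the schema
        //
        //	repeated group a {
        //	    optional int32 b;
        //	}
        //
        // the leaf b has MaxDefinitionLevel() == 2 and MaxRepetitionLevel() == 1.
        // Walking up from b decrements once for the optional b and stops at the
        // repeated group a, so computeLevelInfo returns
        //
        //	LevelInfo{DefLevel: 2, RepLevel: 1, RepeatedAncestorDefLevel: 1}
        //
        // i.e. def levels >= 1 occupy a slot in the leaf's spaced representation.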
    96  
    97  type columnWriter struct {
    98  	metaData *metadata.ColumnChunkMetaDataBuilder
    99  	descr    *schema.Column
   100  
   101  	// scratch buffer if validity bits need to be recalculated
   102  	bitsBuffer *memory.Buffer
   103  	levelInfo  LevelInfo
   104  	pager      PageWriter
   105  	hasDict    bool
   106  	encoding   parquet.Encoding
   107  	props      *parquet.WriterProperties
   108  	defEncoder encoding.LevelEncoder
   109  	repEncoder encoding.LevelEncoder
   110  	mem        memory.Allocator
   111  
   112  	pageStatistics  metadata.TypedStatistics
   113  	chunkStatistics metadata.TypedStatistics
   114  
   115  	// total number of values stored in the current data page. this is the maximum
   116  	// of the number of encoded definition levels and encoded values. for
   117  	// non-repeated, required columns, this is equal to the number of encoded
   118  	// values. for repeated or optional columns, there may be fewer data values
   119  	// than levels, in which case this is the number of encoded levels
   120  	numBufferedValues int64
   121  
   122  	// total number of rows stored in the current data page. This may be smaller
   123  	// than numBufferedValues when writing a repeated column, since one row can
   124  	// span several levels. It is the number of rows written since the last page flush.
   125  	numBufferedRows int
   126  
   127  	// the total number of physically stored values in the current page. for repeated
   128  	// or optional columns, this number may be lower than numBufferedValues
   129  	numDataValues int64
   130  
   131  	rowsWritten       int
   132  	totalBytesWritten int64
   133  	// records the current number of compressed bytes in a column
   134  	totalCompressedBytes int64
   135  	closed               bool
   136  	fallbackToNonDict    bool
   137  
   138  	pages []DataPage
   139  
   140  	defLevelSink *encoding.PooledBufferWriter
   141  	repLevelSink *encoding.PooledBufferWriter
   142  
   143  	uncompressedData bytes.Buffer
   144  	compressedTemp   *bytes.Buffer
   145  
   146  	currentEncoder encoding.TypedEncoder
   147  }
   148  
   149  func newColumnWriterBase(metaData *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) columnWriter {
   150  	ret := columnWriter{
   151  		metaData:     metaData,
   152  		descr:        metaData.Descr(),
   153  		levelInfo:    computeLevelInfo(metaData.Descr()),
   154  		pager:        pager,
   155  		hasDict:      useDict,
   156  		encoding:     enc,
   157  		props:        props,
   158  		mem:          props.Allocator(),
   159  		defLevelSink: encoding.NewPooledBufferWriter(0),
   160  		repLevelSink: encoding.NewPooledBufferWriter(0),
   161  	}
   162  	if pager.HasCompressor() {
   163  		ret.compressedTemp = new(bytes.Buffer)
   164  	}
   165  	if props.StatisticsEnabledFor(ret.descr.Path()) && ret.descr.SortOrder() != schema.SortUNKNOWN {
   166  		ret.pageStatistics = metadata.NewStatistics(ret.descr, props.Allocator())
   167  		ret.chunkStatistics = metadata.NewStatistics(ret.descr, props.Allocator())
   168  	}
   169  
   170  	ret.defEncoder.Init(parquet.Encodings.RLE, ret.descr.MaxDefinitionLevel(), ret.defLevelSink)
   171  	ret.repEncoder.Init(parquet.Encodings.RLE, ret.descr.MaxRepetitionLevel(), ret.repLevelSink)
   172  
   173  	ret.reset()
   174  
   175  	return ret
   176  }
   177  
   178  func (w *columnWriter) CurrentEncoder() encoding.TypedEncoder    { return w.currentEncoder }
   179  func (w *columnWriter) HasBitsBuffer() bool                      { return w.bitsBuffer != nil }
   180  func (w *columnWriter) SetBitsBuffer(buf *memory.Buffer)         { w.bitsBuffer = buf }
   181  func (w *columnWriter) PageStatistics() metadata.TypedStatistics { return w.pageStatistics }
   182  func (w *columnWriter) LevelInfo() LevelInfo                     { return w.levelInfo }
   183  
   184  func (w *columnWriter) Type() parquet.Type {
   185  	return w.descr.PhysicalType()
   186  }
   187  
   188  func (w *columnWriter) Descr() *schema.Column {
   189  	return w.descr
   190  }
   191  
   192  func (w *columnWriter) Properties() *parquet.WriterProperties {
   193  	return w.props
   194  }
   195  
   196  func (w *columnWriter) TotalCompressedBytes() int64 {
   197  	return w.totalCompressedBytes
   198  }
   199  
   200  func (w *columnWriter) TotalBytesWritten() int64 {
   201  	bufferedPagesBytes := int64(0)
   202  	for _, p := range w.pages {
   203  		bufferedPagesBytes += int64(len(p.Data()))
   204  	}
   205  
   206  	return w.totalBytesWritten + bufferedPagesBytes
   207  }
   208  
   209  func (w *columnWriter) RowsWritten() int {
   210  	return w.rowsWritten + w.numBufferedRows
   211  }
   212  
   213  func (w *columnWriter) WriteDataPage(page DataPage) error {
   214  	written, err := w.pager.WriteDataPage(page)
   215  	w.totalBytesWritten += written
   216  	return err
   217  }
   218  
   219  func (w *columnWriter) WriteDefinitionLevels(levels []int16) {
   220  	w.defEncoder.EncodeNoFlush(levels)
   221  }
   222  
   223  func (w *columnWriter) WriteRepetitionLevels(levels []int16) {
   224  	w.repEncoder.EncodeNoFlush(levels)
   225  }
   226  
   227  func (w *columnWriter) reset() {
   228  	w.defLevelSink.Reset(0)
   229  	w.repLevelSink.Reset(0)
   230  
   231  	if w.props.DataPageVersion() == parquet.DataPageV1 {
   232  		// offset the buffers to leave room to record the byte length of the
   233  		// RLE-encoded levels at the start of each buffer
   234  		if w.descr.MaxDefinitionLevel() > 0 {
   235  			w.defLevelSink.SetOffset(arrow.Uint32SizeBytes)
   236  		}
   237  		if w.descr.MaxRepetitionLevel() > 0 {
   238  			w.repLevelSink.SetOffset(arrow.Uint32SizeBytes)
   239  		}
   240  	}
   241  
   242  	w.defEncoder.Reset(w.descr.MaxDefinitionLevel())
   243  	w.repEncoder.Reset(w.descr.MaxRepetitionLevel())
   244  }
   245  
   246  func (w *columnWriter) concatBuffers(defLevelsSize, repLevelsSize int32, values []byte, wr io.Writer) {
   247  	wr.Write(w.repLevelSink.Bytes()[:repLevelsSize])
   248  	wr.Write(w.defLevelSink.Bytes()[:defLevelsSize])
   249  	wr.Write(values)
   250  }
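
        // For a V1 data page, the sinks already carry their 4-byte prefixes (see
        // reset and FlushCurrentPage), so the byte stream written here is:
        //
        //	[4-byte LE rep-level byte length][RLE rep levels]
        //	[4-byte LE def-level byte length][RLE def levels]
        //	[encoded values]
        //
        // with each level region present only when the corresponding max level is
        // greater than zero. For V2 pages the same helper writes the levels with
        // no length prefixes.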
   251  
   252  func (w *columnWriter) EstimatedBufferedValueBytes() int64 {
   253  	return w.currentEncoder.EstimatedDataEncodedSize()
   254  }
   255  
   256  func (w *columnWriter) commitWriteAndCheckPageLimit(numLevels, numValues int64) error {
   257  	w.numBufferedValues += numLevels
   258  	w.numDataValues += numValues
   259  
   260  	enc := w.currentEncoder.EstimatedDataEncodedSize()
   261  	if enc >= w.props.DataPageSize() {
   262  		return w.FlushCurrentPage()
   263  	}
   264  	return nil
   265  }
   266  
   267  func (w *columnWriter) FlushCurrentPage() error {
   268  	var (
   269  		defLevelsRLESize int32
   270  		repLevelsRLESize int32
   271  	)
   272  
   273  	values, err := w.currentEncoder.FlushValues()
   274  	if err != nil {
   275  		return err
   276  	}
   277  	defer values.Release()
   278  
   279  	isV1DataPage := w.props.DataPageVersion() == parquet.DataPageV1
   280  	if w.descr.MaxDefinitionLevel() > 0 {
   281  		w.defEncoder.Flush()
   282  		w.defLevelSink.SetOffset(0)
   283  		sz := w.defEncoder.Len()
   284  		if isV1DataPage {
   285  			sz += arrow.Uint32SizeBytes
   286  			binary.LittleEndian.PutUint32(w.defLevelSink.Bytes(), uint32(w.defEncoder.Len()))
   287  		}
   288  		defLevelsRLESize = int32(sz)
   289  	}
   290  
   291  	if w.descr.MaxRepetitionLevel() > 0 {
   292  		w.repEncoder.Flush()
   293  		w.repLevelSink.SetOffset(0)
   294  		if isV1DataPage {
   295  			binary.LittleEndian.PutUint32(w.repLevelSink.Bytes(), uint32(w.repEncoder.Len()))
   296  		}
   297  		repLevelsRLESize = int32(w.repLevelSink.Len())
   298  	}
   299  
   300  	uncompressed := defLevelsRLESize + repLevelsRLESize + int32(values.Len())
   301  	if isV1DataPage {
   302  		err = w.buildDataPageV1(defLevelsRLESize, repLevelsRLESize, uncompressed, values.Bytes())
   303  	} else {
   304  		err = w.buildDataPageV2(defLevelsRLESize, repLevelsRLESize, uncompressed, values.Bytes())
   305  	}
   306  
   307  	w.reset()
   308  	w.rowsWritten += w.numBufferedRows
   309  	w.numBufferedValues, w.numDataValues, w.numBufferedRows = 0, 0, 0
   310  	return err
   311  }
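
        // A reader-side sketch of the V1 length prefix written above: given
        // levelBytes, the def- or rep-level region of a V1 page body,
        //
        //	n := binary.LittleEndian.Uint32(levelBytes[:4])
        //	runs := levelBytes[4 : 4+n]
        //
        // recovers the RLE runs that the level decoder consumes.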
   312  
   313  func (w *columnWriter) buildDataPageV1(defLevelsRLESize, repLevelsRLESize, uncompressed int32, values []byte) error {
   314  	w.uncompressedData.Reset()
   315  	w.uncompressedData.Grow(int(uncompressed))
   316  	w.concatBuffers(defLevelsRLESize, repLevelsRLESize, values, &w.uncompressedData)
   317  
   318  	pageStats, err := w.getPageStatistics()
   319  	if err != nil {
   320  		return err
   321  	}
   322  	pageStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
   323  	pageStats.Signed = schema.SortSIGNED == w.descr.SortOrder()
   324  	w.resetPageStatistics()
   325  
   326  	var data []byte
   327  	if w.pager.HasCompressor() {
   328  		w.compressedTemp.Reset()
   329  		data = w.pager.Compress(w.compressedTemp, w.uncompressedData.Bytes())
   330  	} else {
   331  		data = w.uncompressedData.Bytes()
   332  	}
   333  
   334  	// buffer the page while dictionary encoding is active; otherwise write it to the sink eagerly
   335  	if w.hasDict && !w.fallbackToNonDict {
   336  		pageSlice := make([]byte, len(data))
   337  		copy(pageSlice, data)
   338  		page := NewDataPageV1WithStats(memory.NewBufferBytes(pageSlice), int32(w.numBufferedValues), w.encoding, parquet.Encodings.RLE, parquet.Encodings.RLE, uncompressed, pageStats)
   339  		w.totalCompressedBytes += int64(page.buf.Len()) // does not include the size of the page header
   340  		w.pages = append(w.pages, page)
   341  	} else {
   342  		w.totalCompressedBytes += int64(len(data))
   343  		dp := NewDataPageV1WithStats(memory.NewBufferBytes(data), int32(w.numBufferedValues), w.encoding, parquet.Encodings.RLE, parquet.Encodings.RLE, uncompressed, pageStats)
   344  		defer dp.Release()
   345  		return w.WriteDataPage(dp)
   346  	}
   347  	return nil
   348  }
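
        // Design note: while dictionary encoding is still active the page bytes are
        // deep-copied into pageSlice before buffering, because uncompressedData and
        // compressedTemp are reused for the next page; in the eager-write branch no
        // copy is needed since the page is handed to the pager immediately.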
   349  
   350  func (w *columnWriter) buildDataPageV2(defLevelsRLESize, repLevelsRLESize, uncompressed int32, values []byte) error {
   351  	var data []byte
   352  	if w.pager.HasCompressor() {
   353  		w.compressedTemp.Reset()
   354  		data = w.pager.Compress(w.compressedTemp, values)
   355  	} else {
   356  		data = values
   357  	}
   358  
   359  	// concatenate uncompressed levels and the possibly compressed values
   360  	var combined bytes.Buffer
   361  	combined.Grow(int(defLevelsRLESize + repLevelsRLESize + int32(len(data))))
   362  	w.concatBuffers(defLevelsRLESize, repLevelsRLESize, data, &combined)
   363  
   364  	pageStats, err := w.getPageStatistics()
   365  	if err != nil {
   366  		return err
   367  	}
   368  	pageStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
   369  	pageStats.Signed = schema.SortSIGNED == w.descr.SortOrder()
   370  	w.resetPageStatistics()
   371  
   372  	numValues := int32(w.numBufferedValues)
   373  	numRows := int32(w.numBufferedRows)
   374  	nullCount := int32(pageStats.NullCount)
   375  	defLevelsByteLen := defLevelsRLESize
   376  	repLevelsByteLen := repLevelsRLESize
   377  
   378  	page := NewDataPageV2WithStats(memory.NewBufferBytes(combined.Bytes()), numValues, nullCount, numRows, w.encoding,
   379  		defLevelsByteLen, repLevelsByteLen, uncompressed, w.pager.HasCompressor(), pageStats)
   380  	if w.hasDict && !w.fallbackToNonDict {
   381  		w.totalCompressedBytes += int64(page.buf.Len()) // does not include the size of the page header
   382  		w.pages = append(w.pages, page)
   383  	} else {
   384  		w.totalCompressedBytes += int64(combined.Len())
   385  		defer page.Release()
   386  		return w.WriteDataPage(page)
   387  	}
   388  	return nil
   389  }
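
        // The V2 page body keeps the levels uncompressed so readers can skip rows
        // without decompressing the values; there are no 4-byte length prefixes
        // because the page header records the level byte lengths:
        //
        //	[RLE rep levels][RLE def levels][possibly-compressed values]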
   390  
   391  func (w *columnWriter) FlushBufferedDataPages() (err error) {
   392  	if w.numBufferedValues > 0 {
   393  		if err = w.FlushCurrentPage(); err != nil {
   394  			return err
   395  		}
   396  	}
   397  
   398  	for _, p := range w.pages {
   399  		defer p.Release()
   400  		if err = w.WriteDataPage(p); err != nil {
   401  			return err
   402  		}
   403  	}
   404  	w.pages = w.pages[:0]
   405  	return
   406  }
   407  
   408  func (w *columnWriter) writeLevels(numValues int64, defLevels, repLevels []int16) int64 {
   409  	toWrite := int64(0)
   410  	// if the field is required and non-repeated, no definition levels
   411  	if defLevels != nil && w.descr.MaxDefinitionLevel() > 0 {
   412  		for _, v := range defLevels[:numValues] {
   413  			if v == w.descr.MaxDefinitionLevel() {
   414  				toWrite++
   415  			}
   416  		}
   417  		w.WriteDefinitionLevels(defLevels[:numValues])
   418  	} else {
   419  		toWrite = numValues
   420  	}
   421  
   422  	if repLevels != nil && w.descr.MaxRepetitionLevel() > 0 {
   423  		// a row could include more than one value;
   424  		// count the occurrences where we start a new row
   425  		for _, v := range repLevels[:numValues] {
   426  			if v == 0 {
   427  				w.numBufferedRows++
   428  			}
   429  		}
   430  
   431  		w.WriteRepetitionLevels(repLevels[:numValues])
   432  	} else {
   433  		// each value is exactly 1 row
   434  		w.numBufferedRows += int(numValues)
   435  	}
   436  	return toWrite
   437  }
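
        // A worked sketch: a list column where row 0 is [7, 8] and row 1 is [9],
        // with MaxDefinitionLevel() == 3 and MaxRepetitionLevel() == 1:
        //
        //	defLevels := []int16{3, 3, 3} // every value fully defined
        //	repLevels := []int16{0, 1, 0} // rep level 0 starts a new row
        //
        // writeLevels(3, defLevels, repLevels) returns 3 values to write and adds
        // 2 to numBufferedRows, one for each zero in repLevels.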
   438  
   439  func (w *columnWriter) writeLevelsSpaced(numLevels int64, defLevels, repLevels []int16) {
   440  	if w.descr.MaxDefinitionLevel() > 0 {
   441  		w.WriteDefinitionLevels(defLevels[:numLevels])
   442  	}
   443  
   444  	if w.descr.MaxRepetitionLevel() > 0 {
   445  		for _, v := range repLevels {
   446  			if v == 0 {
   447  				w.numBufferedRows++
   448  			}
   449  		}
   450  		w.WriteRepetitionLevels(repLevels[:numLevels])
   451  	} else {
   452  		w.numBufferedRows += int(numLevels)
   453  	}
   454  }
   455  
   456  func (w *columnWriter) WriteDictionaryPage() error {
   457  	dictEncoder := w.currentEncoder.(encoding.DictEncoder)
   458  	buffer := memory.NewResizableBuffer(w.mem)
   459  	buffer.Resize(dictEncoder.DictEncodedSize())
   460  	dictEncoder.WriteDict(buffer.Bytes())
   461  	defer buffer.Release()
   462  
   463  	page := NewDictionaryPage(buffer, int32(dictEncoder.NumEntries()), w.props.DictionaryPageEncoding())
   464  	written, err := w.pager.WriteDictionaryPage(page)
   465  	w.totalBytesWritten += written
   466  	return err
   467  }
   468  
   469  type batchWriteInfo struct {
   470  	batchNum  int64
   471  	nullCount int64
   472  }
   473  
   474  func (b batchWriteInfo) numSpaced() int64 { return b.batchNum + b.nullCount }
   475  
   476  // maybeCalculateValidityBits always populates the returned batchWriteInfo's
   477  // batchNum and nullCount fields. Additionally it will update the validity
   478  // bitmap if required (i.e. if at least one level of nullable structs
   479  // directly precedes the leaf node)
   480  func (w *columnWriter) maybeCalculateValidityBits(defLevels []int16, batchSize int64) (out batchWriteInfo) {
   481  	if w.bitsBuffer == nil {
   482  		if w.levelInfo.DefLevel == 0 {
   483  			// in this case def levels should be null and we only
   484  			// need to output counts which will always be equal to
   485  			// the batch size passed in (max def level == 0 indicates
   486  			// there cannot be repeated or null fields)
   487  			out.batchNum = batchSize
   488  			out.nullCount = 0
   489  		} else {
   490  			var (
   491  				toWrite       int64
   492  				spacedToWrite int64
   493  			)
   494  			for i := int64(0); i < batchSize; i++ {
   495  				if defLevels[i] == w.levelInfo.DefLevel {
   496  					toWrite++
   497  				}
   498  				if defLevels[i] >= w.levelInfo.RepeatedAncestorDefLevel {
   499  					spacedToWrite++
   500  				}
   501  			}
   502  			out.batchNum += toWrite
   503  			out.nullCount = spacedToWrite - toWrite
   504  		}
   505  		return
   506  	}
   507  
   508  	// shrinking to fit could cause another allocation, so only resize when the size changes
   509  	newBitmapSize := bitutil.BytesForBits(batchSize)
   510  	if newBitmapSize != int64(w.bitsBuffer.Len()) {
   511  		w.bitsBuffer.ResizeNoShrink(int(newBitmapSize))
   512  	}
   513  
   514  	io := ValidityBitmapInputOutput{
   515  		ValidBits:      w.bitsBuffer.Bytes(),
   516  		ReadUpperBound: batchSize,
   517  	}
   518  	DefLevelsToBitmap(defLevels[:batchSize], w.levelInfo, &io)
   519  	out.batchNum = io.Read - io.NullCount
   520  	out.nullCount = io.NullCount
   521  	return
   522  }
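
        // A worked sketch (bitsBuffer == nil): with levelInfo.DefLevel == 2 and
        // levelInfo.RepeatedAncestorDefLevel == 1,
        //
        //	defLevels := []int16{2, 1, 0}
        //
        // produces batchNum == 1 (only the first value is fully defined) and
        // nullCount == 1 (def level 1 fills a spaced slot with no stored value);
        // def level 0 belongs to an empty repeated ancestor and takes no slot,
        // so numSpaced() == 2.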
   523  
   524  func (w *columnWriter) getPageStatistics() (enc metadata.EncodedStatistics, err error) {
   525  	if w.pageStatistics != nil {
   526  		enc, err = w.pageStatistics.Encode()
   527  	}
   528  	return
   529  }
   530  
   531  func (w *columnWriter) getChunkStatistics() (enc metadata.EncodedStatistics, err error) {
   532  	if w.chunkStatistics != nil {
   533  		enc, err = w.chunkStatistics.Encode()
   534  	}
   535  	return
   536  }
   537  
   538  func (w *columnWriter) resetPageStatistics() {
   539  	if w.chunkStatistics != nil {
   540  		w.chunkStatistics.Merge(w.pageStatistics)
   541  		w.pageStatistics.Reset()
   542  	}
   543  }
   544  
   545  func (w *columnWriter) Close() (err error) {
   546  	if !w.closed {
   547  		w.closed = true
   548  		if w.hasDict && !w.fallbackToNonDict {
   549  			if err = w.WriteDictionaryPage(); err != nil {
   550  				return err
   551  			}
   552  		}
   553  
   554  		if err = w.FlushBufferedDataPages(); err != nil {
   555  			return err
   556  		}
   557  
   558  		// ensure we release and reset everything even if we
   559  		// error out from the chunk statistics handling
   560  		defer func() {
   561  			w.defLevelSink.Reset(0)
   562  			w.repLevelSink.Reset(0)
   563  			if w.bitsBuffer != nil {
   564  				w.bitsBuffer.Release()
   565  				w.bitsBuffer = nil
   566  			}
   567  
   568  			w.currentEncoder.Release()
   569  			w.currentEncoder = nil
   570  		}()
   571  
   572  		var chunkStats metadata.EncodedStatistics
   573  		chunkStats, err = w.getChunkStatistics()
   574  		if err != nil {
   575  			return err
   576  		}
   577  
   578  		chunkStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
   579  		chunkStats.Signed = schema.SortSIGNED == w.descr.SortOrder()
   580  
   581  		if w.rowsWritten > 0 && chunkStats.IsSet() {
   582  			w.metaData.SetStats(chunkStats)
   583  		}
   584  		err = w.pager.Close(w.hasDict, w.fallbackToNonDict)
   585  	}
   586  	return err
   587  }
   588  
   589  func (w *columnWriter) doBatches(total int64, repLevels []int16, action func(offset, batch int64)) {
   590  	batchSize := w.props.WriteBatchSize()
   591  	// if we're writing V1 data pages, have no rep levels, or the max rep level
   592  	// is 0, then just use the regular doBatches function
   593  	if w.props.DataPageVersion() == parquet.DataPageV1 || repLevels == nil || w.descr.MaxRepetitionLevel() == 0 {
   594  		doBatches(total, batchSize, action)
   595  		return
   596  	}
   597  
   598  	// if we get here, we have repetition levels to write and we're writing
   599  	// V2 data pages. since we check whether to flush after each batch we write,
   600  	// ensuring that all batches begin and end on row boundaries lets us avoid
   601  	// complex logic inside of our flushing and batch writing functions.
   602  	// the WriteBatch functions recover from panics, so we can simply panic here
   603  	// on a failure and it will get caught by the WriteBatch function above us
   604  	if int64(len(repLevels)) < total {
   605  		// if we're writing repLevels there has to be at least enough in the slice
   606  		// to write the total number that we're being asked to write
   607  		panic("columnwriter: not enough repetition levels for batch to write")
   608  	}
   609  
   610  	if repLevels[0] != 0 {
   611  		panic("columnwriter: batch writing for V2 data pages must start at a row boundary")
   612  	}
   613  
   614  	// loop by batchSize, but make sure we're ending/starting each batch on a row boundary
   615  	var (
   616  		batchStart, batch int64
   617  	)
   618  	for batchStart = 0; batchStart+batchSize < int64(len(repLevels)); batchStart += batch {
   619  		// check one past the last value of the batch to see if it starts a new row.
   620  		// if it doesn't, shrink the batch and fall back to the previous
   621  		// row boundary to end on
   622  		batch = batchSize
   623  		for ; repLevels[batchStart+batch] != 0; batch-- {
   624  		}
   625  		// batchStart <--> batch now begins and ends on a row boundary!
   626  		action(batchStart, batch)
   627  	}
   628  	action(batchStart, int64(len(repLevels))-batchStart)
   629  }
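
        // A worked sketch: with batchSize == 3 and
        //
        //	repLevels := []int16{0, 1, 0, 1, 0, 1, 0, 1}
        //
        // every trial batch of 3 would end mid-row, so the inner loop shrinks it
        // to 2 and action is invoked as (0, 2), (2, 2), (4, 2), (6, 2): each batch
        // begins and ends on a row boundary.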
   630  
   631  func doBatches(total, batchSize int64, action func(offset, batch int64)) {
   632  	numBatches := total / batchSize
   633  	for i := int64(0); i < numBatches; i++ {
   634  		action(i*batchSize, batchSize)
   635  	}
   636  	if total%batchSize > 0 {
   637  		action(numBatches*batchSize, total%batchSize)
   638  	}
   639  }
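
        // For example, doBatches(10, 4, action) invokes action(0, 4), action(4, 4),
        // and action(8, 2).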
   640  
   641  func levelSliceOrNil(rep []int16, offset, batch int64) []int16 {
   642  	if rep == nil {
   643  		return nil
   644  	}
   645  	return rep[offset : batch+offset]
   646  }
   647  
   648  //lint:ignore U1000 maybeReplaceValidity
   649  func (w *columnWriter) maybeReplaceValidity(values arrow.Array, newNullCount int64) arrow.Array {
   650  	if w.bitsBuffer == nil {
   651  		values.Retain()
   652  		return values
   653  	}
   654  
   655  	if len(values.Data().Buffers()) == 0 {
   656  		values.Retain()
   657  		return values
   658  	}
   659  
   660  	buffers := make([]*memory.Buffer, len(values.Data().Buffers()))
   661  	copy(buffers, values.Data().Buffers())
   662  	// bitsBuffer should already be the offset slice of the validity bits
   663  	// we want so we don't need to manually slice the validity buffer
   664  	buffers[0] = w.bitsBuffer
   665  
   666  	if values.Data().Offset() > 0 {
   667  		data := values.Data()
   668  		elemSize := data.DataType().(arrow.FixedWidthDataType).Bytes()
   669  		start := data.Offset() * elemSize
   670  		end := start + data.Len()*elemSize
   671  		buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[start:end])
   672  	}
   673  
   674  	data := array.NewData(values.DataType(), values.Len(), buffers, nil, int(newNullCount), 0)
   675  	defer data.Release()
   676  	return array.MakeFromData(data)
   677  }
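
        // A worked sketch: for an int32 array sliced to offset 2 and length 3,
        // elemSize == 4, so buffers[1] is rebuilt from bytes [8:20) of the original
        // values buffer, re-aligning the data with the zero-offset bitsBuffer bitmap.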