github.com/apache/arrow/go/v7@v7.0.1/parquet/file/column_writer.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file

import (
	"bytes"
	"encoding/binary"
	"io"

	"github.com/apache/arrow/go/v7/arrow"
	"github.com/apache/arrow/go/v7/arrow/array"
	"github.com/apache/arrow/go/v7/arrow/bitutil"
	"github.com/apache/arrow/go/v7/arrow/memory"
	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/internal/encoding"
	"github.com/apache/arrow/go/v7/parquet/metadata"
	"github.com/apache/arrow/go/v7/parquet/schema"
)

//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata column_writer_types.gen.go.tmpl

// ColumnChunkWriter is the base interface for all column writers. To directly write
// data to the column, you need to type-assert it to the correctly typed ColumnChunkWriter
// instance, such as Int32ColumnChunkWriter.
type ColumnChunkWriter interface {
	// Close ends this column, flushing any buffered pages and the dictionary
	// page (if there is one) to the underlying sink
	Close() error
	// Type returns the underlying physical parquet type for this column
	Type() parquet.Type
	// Descr returns the column information for this writer
	Descr() *schema.Column
	// RowsWritten returns the number of rows that have so far been written with this writer
	RowsWritten() int
	// TotalCompressedBytes returns the number of bytes, after compression, that have been written so far
	TotalCompressedBytes() int64
	// TotalBytesWritten includes the bytes for writing dictionary pages, while TotalCompressedBytes is
	// just the data and page headers
	TotalBytesWritten() int64
	// Properties returns the current WriterProperties in use for this writer
	Properties() *parquet.WriterProperties

	LevelInfo() LevelInfo
	SetBitsBuffer(*memory.Buffer)
}
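
// A minimal usage sketch (the variable names here are illustrative, not part
// of this API): obtain a ColumnChunkWriter from a row group writer and
// type-assert it to the writer generated for the column's physical type.
//
//	cw, err := rgWriter.NextColumn() // rgWriter: a SerialRowGroupWriter
//	if err != nil {
//		return err
//	}
//	i32w := cw.(*Int32ColumnChunkWriter)
//	if _, err := i32w.WriteBatch(values, defLevels, repLevels); err != nil {
//		return err
//	}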

func computeLevelInfo(descr *schema.Column) (info LevelInfo) {
	info.DefLevel = descr.MaxDefinitionLevel()
	info.RepLevel = descr.MaxRepetitionLevel()

	minSpacedDefLevel := descr.MaxDefinitionLevel()
	n := descr.SchemaNode()
	for n != nil && n.RepetitionType() != parquet.Repetitions.Repeated {
		if n.RepetitionType() == parquet.Repetitions.Optional {
			minSpacedDefLevel--
		}
		n = n.Parent()
	}
	info.RepeatedAncestorDefLevel = minSpacedDefLevel
	return
}
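
// As a worked example (an illustrative schema, not taken from a test): for an
// optional int32 leaf nested inside an optional group, DefLevel is 2, RepLevel
// is 0, and walking up from the leaf decrements minSpacedDefLevel once per
// optional node, giving RepeatedAncestorDefLevel == 0: every def level >= 0
// occupies a slot in the leaf's spaced representation. If the leaf instead
// lived under a repeated (list) node, the walk would stop at that node and
// RepeatedAncestorDefLevel would be the def level at which the containing list
// entry exists, so any lower def level means the slot itself is absent.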

type columnWriter struct {
	metaData *metadata.ColumnChunkMetaDataBuilder
	descr    *schema.Column

	// scratch buffer if validity bits need to be recalculated
	bitsBuffer *memory.Buffer
	levelInfo  LevelInfo
	pager      PageWriter
	hasDict    bool
	encoding   parquet.Encoding
	props      *parquet.WriterProperties
	defEncoder encoding.LevelEncoder
	repEncoder encoding.LevelEncoder
	mem        memory.Allocator

	pageStatistics  metadata.TypedStatistics
	chunkStatistics metadata.TypedStatistics

	// total number of values stored in the current data page. this is the maximum
	// of the number of encoded def levels or encoded values. for
	// non-repeated, required columns, this is equal to the number of encoded
	// values. for repeated or optional values, there may be fewer data values
	// than levels, and this tells you how many encoded levels there are in that case
	numBufferedValues int64

	// total number of rows stored in the current data page. this may be smaller
	// than numBufferedValues when writing a column with repeated values, since a
	// single row can span multiple values. this is the number of rows written
	// since the last time we flushed a page.
	numBufferedRows int

	// the total number of stored values in the current page. for repeated or
	// optional values, this number may be lower than numBufferedValues
	numDataValues int64

	rowsWritten       int
	totalBytesWritten int64
	// records the current number of compressed bytes in a column
	totalCompressedBytes int64
	closed               bool
	fallbackToNonDict    bool

	pages []DataPage

	defLevelSink *encoding.PooledBufferWriter
	repLevelSink *encoding.PooledBufferWriter

	uncompressedData bytes.Buffer
	compressedTemp   *bytes.Buffer

	currentEncoder encoding.TypedEncoder
}

func newColumnWriterBase(metaData *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) columnWriter {
	ret := columnWriter{
		metaData:     metaData,
		descr:        metaData.Descr(),
		levelInfo:    computeLevelInfo(metaData.Descr()),
		pager:        pager,
		hasDict:      useDict,
		encoding:     enc,
		props:        props,
		mem:          props.Allocator(),
		defLevelSink: encoding.NewPooledBufferWriter(0),
		repLevelSink: encoding.NewPooledBufferWriter(0),
	}
	if pager.HasCompressor() {
		ret.compressedTemp = new(bytes.Buffer)
	}
	if props.StatisticsEnabledFor(ret.descr.Path()) && ret.descr.SortOrder() != schema.SortUNKNOWN {
		ret.pageStatistics = metadata.NewStatistics(ret.descr, props.Allocator())
		ret.chunkStatistics = metadata.NewStatistics(ret.descr, props.Allocator())
	}

	ret.defEncoder.Init(parquet.Encodings.RLE, ret.descr.MaxDefinitionLevel(), ret.defLevelSink)
	ret.repEncoder.Init(parquet.Encodings.RLE, ret.descr.MaxRepetitionLevel(), ret.repLevelSink)

	ret.reset()

	return ret
}
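
// The typed column writers (Int32ColumnChunkWriter, ByteArrayColumnChunkWriter,
// and so on, generated from column_writer_types.gen.go.tmpl per the go:generate
// directive above) embed columnWriter and layer the typed WriteBatch logic on
// top of this shared base.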

func (w *columnWriter) SetBitsBuffer(buf *memory.Buffer) { w.bitsBuffer = buf }

func (w *columnWriter) LevelInfo() LevelInfo { return w.levelInfo }

func (w *columnWriter) Type() parquet.Type {
	return w.descr.PhysicalType()
}

func (w *columnWriter) Descr() *schema.Column {
	return w.descr
}

func (w *columnWriter) Properties() *parquet.WriterProperties {
	return w.props
}

func (w *columnWriter) TotalCompressedBytes() int64 {
	return w.totalCompressedBytes
}

func (w *columnWriter) TotalBytesWritten() int64 {
	return w.totalBytesWritten
}

func (w *columnWriter) RowsWritten() int {
	return w.rowsWritten + w.numBufferedRows
}

func (w *columnWriter) WriteDataPage(page DataPage) error {
	written, err := w.pager.WriteDataPage(page)
	w.totalBytesWritten += written
	return err
}

func (w *columnWriter) WriteDefinitionLevels(levels []int16) {
	w.defEncoder.EncodeNoFlush(levels)
}

func (w *columnWriter) WriteRepetitionLevels(levels []int16) {
	w.repEncoder.EncodeNoFlush(levels)
}

func (w *columnWriter) reset() {
	w.defLevelSink.Reset(0)
	w.repLevelSink.Reset(0)

	if w.props.DataPageVersion() == parquet.DataPageV1 {
		// offset the buffers to make room to record the encoded byte length
		// of the levels at the beginning of each buffer, filled in after
		// we've encoded them with RLE
		if w.descr.MaxDefinitionLevel() > 0 {
			w.defLevelSink.SetOffset(arrow.Uint32SizeBytes)
		}
		if w.descr.MaxRepetitionLevel() > 0 {
			w.repLevelSink.SetOffset(arrow.Uint32SizeBytes)
		}
	}

	w.defEncoder.Reset(w.descr.MaxDefinitionLevel())
	w.repEncoder.Reset(w.descr.MaxRepetitionLevel())
}
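
// In a V1 data page the RLE-encoded levels are each preceded by their encoded
// byte length as a little-endian uint32, so the uncompressed page body laid
// out by concatBuffers looks like:
//
//	[rep levels len][RLE rep levels][def levels len][RLE def levels][encoded values]
//
// reset reserves those 4 bytes up front via SetOffset so that FlushCurrentPage
// can backfill the lengths without shifting the already-encoded level data.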

func (w *columnWriter) concatBuffers(defLevelsSize, repLevelsSize int32, values []byte, wr io.Writer) {
	wr.Write(w.repLevelSink.Bytes()[:repLevelsSize])
	wr.Write(w.defLevelSink.Bytes()[:defLevelsSize])
	wr.Write(values)
}

func (w *columnWriter) EstimatedBufferedValueBytes() int64 {
	return w.currentEncoder.EstimatedDataEncodedSize()
}

func (w *columnWriter) commitWriteAndCheckPageLimit(numLevels, numValues int64) error {
	w.numBufferedValues += numLevels
	w.numDataValues += numValues

	if w.currentEncoder.EstimatedDataEncodedSize() >= w.props.DataPageSize() {
		return w.FlushCurrentPage()
	}
	return nil
}

func (w *columnWriter) FlushCurrentPage() error {
	var (
		defLevelsRLESize int32 = 0
		repLevelsRLESize int32 = 0
	)

	values, err := w.currentEncoder.FlushValues()
	if err != nil {
		return err
	}
	defer values.Release()

	isV1DataPage := w.props.DataPageVersion() == parquet.DataPageV1
	if w.descr.MaxDefinitionLevel() > 0 {
		w.defEncoder.Flush()
		w.defLevelSink.SetOffset(0)
		sz := w.defEncoder.Len()
		if isV1DataPage {
			sz += arrow.Uint32SizeBytes
			binary.LittleEndian.PutUint32(w.defLevelSink.Bytes(), uint32(w.defEncoder.Len()))
		}
		defLevelsRLESize = int32(sz)
	}

	if w.descr.MaxRepetitionLevel() > 0 {
		w.repEncoder.Flush()
		w.repLevelSink.SetOffset(0)
		if isV1DataPage {
			binary.LittleEndian.PutUint32(w.repLevelSink.Bytes(), uint32(w.repEncoder.Len()))
		}
		repLevelsRLESize = int32(w.repLevelSink.Len())
	}

	uncompressed := defLevelsRLESize + repLevelsRLESize + int32(values.Len())
	if isV1DataPage {
		err = w.buildDataPageV1(defLevelsRLESize, repLevelsRLESize, uncompressed, values.Bytes())
	} else {
		err = w.buildDataPageV2(defLevelsRLESize, repLevelsRLESize, uncompressed, values.Bytes())
	}
	if err != nil {
		return err
	}

	w.reset()
	w.rowsWritten += w.numBufferedRows
	w.numBufferedValues, w.numDataValues, w.numBufferedRows = 0, 0, 0
	return nil
}
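
// Note the difference between the two page builders below: V1 pages compress
// the levels and values together as a single block, while V2 pages keep the
// levels uncompressed and compress only the values buffer.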

func (w *columnWriter) buildDataPageV1(defLevelsRLESize, repLevelsRLESize, uncompressed int32, values []byte) error {
	w.uncompressedData.Reset()
	w.uncompressedData.Grow(int(uncompressed))
	w.concatBuffers(defLevelsRLESize, repLevelsRLESize, values, &w.uncompressedData)

	pageStats, err := w.getPageStatistics()
	if err != nil {
		return err
	}
	pageStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
	pageStats.Signed = schema.SortSIGNED == w.descr.SortOrder()
	w.resetPageStatistics()

	var data []byte
	if w.pager.HasCompressor() {
		w.compressedTemp.Reset()
		data = w.pager.Compress(w.compressedTemp, w.uncompressedData.Bytes())
	} else {
		data = w.uncompressedData.Bytes()
	}

	// if the dictionary is still being built, buffer the page so the dictionary
	// page can be written ahead of it; otherwise write the page to the sink eagerly
	if w.hasDict && !w.fallbackToNonDict {
		pageSlice := make([]byte, len(data))
		copy(pageSlice, data)
		page := NewDataPageV1WithStats(memory.NewBufferBytes(pageSlice), int32(w.numBufferedValues), w.encoding, parquet.Encodings.RLE, parquet.Encodings.RLE, uncompressed, pageStats)
		w.totalCompressedBytes += int64(page.buf.Len()) // + size of page header
		w.pages = append(w.pages, page)
	} else {
		w.totalCompressedBytes += int64(len(data))
		dp := NewDataPageV1WithStats(memory.NewBufferBytes(data), int32(w.numBufferedValues), w.encoding, parquet.Encodings.RLE, parquet.Encodings.RLE, uncompressed, pageStats)
		defer dp.Release()
		return w.WriteDataPage(dp)
	}
	return nil
}

func (w *columnWriter) buildDataPageV2(defLevelsRLESize, repLevelsRLESize, uncompressed int32, values []byte) error {
	var data []byte
	if w.pager.HasCompressor() {
		w.compressedTemp.Reset()
		data = w.pager.Compress(w.compressedTemp, values)
	} else {
		data = values
	}

	// concatenate the uncompressed levels and the possibly compressed values
	var combined bytes.Buffer
	combined.Grow(int(defLevelsRLESize + repLevelsRLESize + int32(len(data))))
	w.concatBuffers(defLevelsRLESize, repLevelsRLESize, data, &combined)

	pageStats, err := w.getPageStatistics()
	if err != nil {
		return err
	}
	pageStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
	pageStats.Signed = schema.SortSIGNED == w.descr.SortOrder()
	w.resetPageStatistics()

	numValues := int32(w.numBufferedValues)
	numRows := int32(w.numBufferedRows)
	nullCount := int32(pageStats.NullCount)
	defLevelsByteLen := defLevelsRLESize
	repLevelsByteLen := repLevelsRLESize

	page := NewDataPageV2WithStats(memory.NewBufferBytes(combined.Bytes()), numValues, nullCount, numRows, w.encoding,
		defLevelsByteLen, repLevelsByteLen, uncompressed, w.pager.HasCompressor(), pageStats)
	if w.hasDict && !w.fallbackToNonDict {
		w.totalCompressedBytes += int64(page.buf.Len()) // + size of page header
		w.pages = append(w.pages, page)
	} else {
		w.totalCompressedBytes += int64(combined.Len())
		defer page.Release()
		return w.WriteDataPage(page)
	}
	return nil
}

func (w *columnWriter) FlushBufferedDataPages() {
	if w.numBufferedValues > 0 {
		w.FlushCurrentPage()
	}

	for _, p := range w.pages {
		defer p.Release()
		w.WriteDataPage(p)
	}
	w.pages = w.pages[:0]
	w.totalCompressedBytes = 0
}

func (w *columnWriter) writeLevels(numValues int64, defLevels, repLevels []int16) int64 {
	toWrite := int64(0)
	// if the field is required and non-repeated, there are no definition levels
	if defLevels != nil && w.descr.MaxDefinitionLevel() > 0 {
		for _, v := range defLevels[:numValues] {
			if v == w.descr.MaxDefinitionLevel() {
				toWrite++
			}
		}
		w.WriteDefinitionLevels(defLevels[:numValues])
	} else {
		toWrite = numValues
	}

	if repLevels != nil && w.descr.MaxRepetitionLevel() > 0 {
		// a row could include more than one value;
		// count the times we start a new row
		for _, v := range repLevels[:numValues] {
			if v == 0 {
				w.numBufferedRows++
			}
		}

		w.WriteRepetitionLevels(repLevels[:numValues])
	} else {
		// each value is exactly one row
		w.numBufferedRows += int(numValues)
	}
	return toWrite
}
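
// As an illustration (hypothetical data, standard 3-level list encoding):
// writing the rows [[1, 2], [], [3]] to a column of optional lists of
// optional int32 (max def level 3, max rep level 1) passes def levels
// [3, 3, 1, 3] and rep levels [0, 1, 0, 0]. The three zero rep levels bump
// numBufferedRows by 3, and the three def levels equal to the max mean
// toWrite == 3 actual values get encoded.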

func (w *columnWriter) writeLevelsSpaced(numLevels int64, defLevels, repLevels []int16) {
	if w.descr.MaxDefinitionLevel() > 0 {
		w.WriteDefinitionLevels(defLevels[:numLevels])
	}

	if w.descr.MaxRepetitionLevel() > 0 {
		// count the times we start a new row
		for _, v := range repLevels[:numLevels] {
			if v == 0 {
				w.numBufferedRows++
			}
		}
		w.WriteRepetitionLevels(repLevels[:numLevels])
	} else {
		w.numBufferedRows += int(numLevels)
	}
}

func (w *columnWriter) WriteDictionaryPage() error {
	dictEncoder := w.currentEncoder.(encoding.DictEncoder)
	buffer := memory.NewResizableBuffer(w.mem)
	defer buffer.Release()
	buffer.Resize(dictEncoder.DictEncodedSize())
	dictEncoder.WriteDict(buffer.Bytes())

	page := NewDictionaryPage(buffer, int32(dictEncoder.NumEntries()), w.props.DictionaryPageEncoding())
	written, err := w.pager.WriteDictionaryPage(page)
	w.totalBytesWritten += written
	return err
}
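
// Data pages are buffered in w.pages while dictionary encoding is in effect
// (see buildDataPageV1/V2) so that the dictionary page written here can be
// emitted ahead of them in the column chunk; Close below calls this before
// flushing the buffered data pages.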

type batchWriteInfo struct {
	batchNum  int64
	nullCount int64
}

func (b batchWriteInfo) numSpaced() int64 { return b.batchNum + b.nullCount }
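
// e.g. a batch covering 8 leaf slots of which 3 are null gives
// batchWriteInfo{batchNum: 5, nullCount: 3}; numSpaced() == 8 is the width of
// the spaced (null-padded) representation, batchNum the count of values
// physically present.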

// maybeCalculateValidityBits always populates out.batchNum and out.nullCount.
// Additionally it will update the validity bitmap if required (i.e. if at
// least one level of nullable structs directly precedes the leaf node)
func (w *columnWriter) maybeCalculateValidityBits(defLevels []int16, batchSize int64) (out batchWriteInfo) {
	if w.bitsBuffer == nil {
		if w.levelInfo.DefLevel == 0 {
			// in this case def levels should be null and we only
			// need to output counts which will always be equal to
			// the batch size passed in (max def level == 0 indicates
			// there cannot be repeated or null fields)
			out.batchNum = batchSize
			out.nullCount = 0
		} else {
			var (
				toWrite       int64
				spacedToWrite int64
			)
			for i := int64(0); i < batchSize; i++ {
				if defLevels[i] == w.levelInfo.DefLevel {
					toWrite++
				}
				if defLevels[i] >= w.levelInfo.RepeatedAncestorDefLevel {
					spacedToWrite++
				}
			}
			out.batchNum += toWrite
			out.nullCount = spacedToWrite - toWrite
		}
		return
	}

	// shrinking to fit could cause another allocation, so only grow, never shrink
	newBitmapSize := bitutil.BytesForBits(batchSize)
	if newBitmapSize != int64(w.bitsBuffer.Len()) {
		w.bitsBuffer.ResizeNoShrink(int(newBitmapSize))
	}

	vio := ValidityBitmapInputOutput{
		ValidBits:      w.bitsBuffer.Bytes(),
		ReadUpperBound: batchSize,
	}
	DefLevelsToBitmap(defLevels[:batchSize], w.levelInfo, &vio)
	out.batchNum = vio.Read - vio.NullCount
	out.nullCount = vio.NullCount
	return
}
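
// As an illustration (hypothetical def levels): for an optional leaf with
// levelInfo.DefLevel == 1 and RepeatedAncestorDefLevel == 0 and no bitsBuffer,
// defLevels [1, 0, 1, 1] yield batchNum == 3 (levels equal to DefLevel) and
// nullCount == 1 (a slot present in the spaced output but holding no value).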

func (w *columnWriter) getPageStatistics() (enc metadata.EncodedStatistics, err error) {
	if w.pageStatistics != nil {
		enc, err = w.pageStatistics.Encode()
	}
	return
}

func (w *columnWriter) getChunkStatistics() (enc metadata.EncodedStatistics, err error) {
	if w.chunkStatistics != nil {
		enc, err = w.chunkStatistics.Encode()
	}
	return
}

func (w *columnWriter) resetPageStatistics() {
	if w.chunkStatistics != nil {
		w.chunkStatistics.Merge(w.pageStatistics)
		w.pageStatistics.Reset()
	}
}

func (w *columnWriter) Close() (err error) {
	if !w.closed {
		w.closed = true
		if w.hasDict && !w.fallbackToNonDict {
			if err = w.WriteDictionaryPage(); err != nil {
				return err
			}
		}

		w.FlushBufferedDataPages()

		var chunkStats metadata.EncodedStatistics
		chunkStats, err = w.getChunkStatistics()
		if err != nil {
			return err
		}

		chunkStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
		chunkStats.Signed = schema.SortSIGNED == w.descr.SortOrder()

		if w.rowsWritten > 0 && chunkStats.IsSet() {
			w.metaData.SetStats(chunkStats)
		}
		err = w.pager.Close(w.hasDict, w.fallbackToNonDict)

		w.defLevelSink.Reset(0)
		w.repLevelSink.Reset(0)
	}
	return err
}

func (w *columnWriter) doBatches(total int64, repLevels []int16, action func(offset, batch int64)) {
	batchSize := w.props.WriteBatchSize()
	// if we're writing V1 data pages, have no rep levels, or the max rep level
	// is 0, then just use the regular doBatches function
	if w.props.DataPageVersion() == parquet.DataPageV1 || repLevels == nil || w.descr.MaxRepetitionLevel() == 0 {
		doBatches(total, batchSize, action)
		return
	}

	// if we get here that means we have repetition levels to write and we're writing
	// V2 data pages. since we check whether to flush after each batch we write,
	// if we ensure all the batches begin and end on row boundaries we can avoid
	// complex logic inside of our flushing or batch writing functions.
	// the WriteBatch function recovers from panics so we can just panic here on a failure
	// and it'll get caught by the WriteBatch functions above it
	if int64(len(repLevels)) < total {
		// if we're writing repLevels there has to be at least enough in the slice
		// to write the total number that we're being asked to write
		panic("columnwriter: not enough repetition levels for batch to write")
	}

	if repLevels[0] != 0 {
		panic("columnwriter: batch writing for V2 data pages must start at a row boundary")
	}

	// loop by batchSize, but make sure we're ending/starting each batch on a row boundary
	var (
		batchStart, batch int64
	)
	for batchStart = 0; batchStart+batchSize < int64(len(repLevels)); batchStart += batch {
		// check whether the value one past the end of the batch starts a new row;
		// if it doesn't, shrink the batch and fall back to the previous row
		// boundary to end on
		batch = batchSize
		for ; repLevels[batchStart+batch] != 0; batch-- {
		}
		// batchStart <--> batch now begins and ends on a row boundary!
		action(batchStart, batch)
	}
	action(batchStart, int64(len(repLevels))-batchStart)
}

func doBatches(total, batchSize int64, action func(offset, batch int64)) {
	numBatches := total / batchSize
	for i := int64(0); i < numBatches; i++ {
		action(i*batchSize, batchSize)
	}
	if total%batchSize > 0 {
		action(numBatches*batchSize, total%batchSize)
	}
}
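
// e.g. doBatches(10, 4, fn) calls fn(0, 4), fn(4, 4), and then fn(8, 2) for
// the remainder.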

func levelSliceOrNil(rep []int16, offset, batch int64) []int16 {
	if rep == nil {
		return nil
	}
	return rep[offset : batch+offset]
}

func (w *ByteArrayColumnChunkWriter) maybeReplaceValidity(values array.Interface, newNullCount int64) array.Interface {
	if w.bitsBuffer == nil {
		return values
	}

	buffers := values.Data().Buffers()
	if len(buffers) == 0 {
		return values
	}
	// bitsBuffer should already be the offset slice of the validity bits
	// we want so we don't need to manually slice the validity buffer
	buffers[0] = w.bitsBuffer

	if values.Data().Offset() > 0 {
		data := values.Data()
		buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[data.Offset()*arrow.Int32SizeBytes : data.Len()*arrow.Int32SizeBytes])
	}
	return array.MakeFromData(array.NewData(values.DataType(), values.Len(), buffers, nil, int(newNullCount), 0))
}