github.com/apache/arrow/go/v16@v16.1.0/parquet/file/column_writer.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package file

import (
	"bytes"
	"encoding/binary"
	"io"

	"github.com/apache/arrow/go/v16/arrow"
	"github.com/apache/arrow/go/v16/arrow/array"
	"github.com/apache/arrow/go/v16/arrow/bitutil"
	"github.com/apache/arrow/go/v16/arrow/memory"
	"github.com/apache/arrow/go/v16/parquet"
	"github.com/apache/arrow/go/v16/parquet/internal/encoding"
	"github.com/apache/arrow/go/v16/parquet/metadata"
	"github.com/apache/arrow/go/v16/parquet/schema"
)

//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata column_writer_types.gen.go.tmpl

// ColumnChunkWriter is the base interface for all column writers. To directly
// write data to the column, assert it to the correctly typed ColumnChunkWriter
// instance, such as Int32ColumnWriter.
type ColumnChunkWriter interface {
	// Close ends this column, flushing any remaining buffered data,
	// and returns an error if one occurred.
	Close() error
	// Type returns the underlying physical parquet type for this column
	Type() parquet.Type
	// Descr returns the column information for this writer
	Descr() *schema.Column
	// RowsWritten returns the number of rows that have so far been written with this writer
	RowsWritten() int
	// TotalCompressedBytes returns the number of bytes, after compression, that have been written so far
	TotalCompressedBytes() int64
	// TotalBytesWritten includes the bytes for writing dictionary pages, while TotalCompressedBytes is
	// just the data and page headers
	TotalBytesWritten() int64
	// Properties returns the current WriterProperties in use for this writer
	Properties() *parquet.WriterProperties
	// CurrentEncoder returns the current encoder that is being used
	// to encode new data written to this column
	CurrentEncoder() encoding.TypedEncoder
	// FallbackToPlain forces a dictionary-encoded column writer to
	// fall back to plain encoding, first flushing out any data it has
	// and then changing the encoder to use plain encoding from
	// here on out.
	//
	// This is automatically called if the dictionary reaches the
	// limit in the write properties or under specific conditions.
	//
	// Has no effect if the column is not currently dictionary encoded.
	FallbackToPlain()
	// PageStatistics returns the current page statistics for this
	// column writer. May be nil if stats are not enabled.
	PageStatistics() metadata.TypedStatistics
	// WriteDictIndices writes an arrow array of dictionary indices
	// to this column. This should only be called by pqarrow or
	// if you *really* know what you're doing.
	WriteDictIndices(arrow.Array, []int16, []int16) error

	LevelInfo() LevelInfo
	SetBitsBuffer(*memory.Buffer)
	HasBitsBuffer() bool
}
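
// A minimal usage sketch (hedged: the row-group writer setup is omitted, and
// the typed writer name and its WriteBatch signature come from the generated
// column_writer_types.gen.go rather than this file, so treat both as
// assumptions for illustration):
//
//	cw, err := rgWriter.NextColumn() // rgWriter: a previously obtained row group writer
//	if err != nil {
//		// handle error
//	}
//	if i32w, ok := cw.(*Int32ColumnChunkWriter); ok {
//		// nil def/rep levels are acceptable for a required, non-repeated column
//		_, err = i32w.WriteBatch([]int32{1, 2, 3}, nil, nil)
//	}
//	err = cw.Close()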

func computeLevelInfo(descr *schema.Column) (info LevelInfo) {
	info.DefLevel = descr.MaxDefinitionLevel()
	info.RepLevel = descr.MaxRepetitionLevel()

	minSpacedDefLevel := descr.MaxDefinitionLevel()
	n := descr.SchemaNode()
	for n != nil && n.RepetitionType() != parquet.Repetitions.Repeated {
		if n.RepetitionType() == parquet.Repetitions.Optional {
			minSpacedDefLevel--
		}
		n = n.Parent()
	}
	info.RepeatedAncestorDefLevel = minSpacedDefLevel
	return
}
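
// Worked example (illustrative only, not part of this file): for the schema
//
//	repeated group records { optional int32 value }
//
// the leaf "value" has MaxDefinitionLevel 2 (one level for the repeated group,
// one for the optional leaf) and MaxRepetitionLevel 1. Walking up from the
// leaf, the optional node decrements minSpacedDefLevel to 1 before the loop
// stops at the repeated ancestor, so RepeatedAncestorDefLevel is 1: any def
// level >= 1 occupies a slot in the spaced values, while a def level of 0
// (an empty record list) does not.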

type columnWriter struct {
	metaData *metadata.ColumnChunkMetaDataBuilder
	descr    *schema.Column

	// scratch buffer if validity bits need to be recalculated
	bitsBuffer *memory.Buffer
	levelInfo  LevelInfo
	pager      PageWriter
	hasDict    bool
	encoding   parquet.Encoding
	props      *parquet.WriterProperties
	defEncoder encoding.LevelEncoder
	repEncoder encoding.LevelEncoder
	mem        memory.Allocator

	pageStatistics  metadata.TypedStatistics
	chunkStatistics metadata.TypedStatistics

	// total number of values stored in the current data page. This is the
	// maximum of the number of encoded definition levels and the number of
	// encoded values. For non-repeated, required columns it is equal to the
	// number of encoded values. For repeated or optional columns there may be
	// fewer data values than levels, and this tells you how many encoded
	// levels there are in that case.
	numBufferedValues int64

	// total number of rows stored in the current data page. This may be larger
	// than numBufferedValues when writing a column with repeated values. This is
	// the number of rows written since the last time we flushed a page.
	numBufferedRows int

	// the total number of stored values in the current page. For repeated or
	// optional columns this number may be lower than numBufferedValues.
	numDataValues int64

	rowsWritten       int
	totalBytesWritten int64
	// records the current number of compressed bytes in a column
	totalCompressedBytes int64
	closed               bool
	fallbackToNonDict    bool

	pages []DataPage

	defLevelSink *encoding.PooledBufferWriter
	repLevelSink *encoding.PooledBufferWriter

	uncompressedData bytes.Buffer
	compressedTemp   *bytes.Buffer

	currentEncoder encoding.TypedEncoder
}

func newColumnWriterBase(metaData *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) columnWriter {
	ret := columnWriter{
		metaData:     metaData,
		descr:        metaData.Descr(),
		levelInfo:    computeLevelInfo(metaData.Descr()),
		pager:        pager,
		hasDict:      useDict,
		encoding:     enc,
		props:        props,
		mem:          props.Allocator(),
		defLevelSink: encoding.NewPooledBufferWriter(0),
		repLevelSink: encoding.NewPooledBufferWriter(0),
	}
	if pager.HasCompressor() {
		ret.compressedTemp = new(bytes.Buffer)
	}
	if props.StatisticsEnabledFor(ret.descr.Path()) && ret.descr.SortOrder() != schema.SortUNKNOWN {
		ret.pageStatistics = metadata.NewStatistics(ret.descr, props.Allocator())
		ret.chunkStatistics = metadata.NewStatistics(ret.descr, props.Allocator())
	}

	ret.defEncoder.Init(parquet.Encodings.RLE, ret.descr.MaxDefinitionLevel(), ret.defLevelSink)
	ret.repEncoder.Init(parquet.Encodings.RLE, ret.descr.MaxRepetitionLevel(), ret.repLevelSink)

	ret.reset()

	return ret
}

func (w *columnWriter) CurrentEncoder() encoding.TypedEncoder    { return w.currentEncoder }
func (w *columnWriter) HasBitsBuffer() bool                      { return w.bitsBuffer != nil }
func (w *columnWriter) SetBitsBuffer(buf *memory.Buffer)         { w.bitsBuffer = buf }
func (w *columnWriter) PageStatistics() metadata.TypedStatistics { return w.pageStatistics }
func (w *columnWriter) LevelInfo() LevelInfo                     { return w.levelInfo }

func (w *columnWriter) Type() parquet.Type {
	return w.descr.PhysicalType()
}

func (w *columnWriter) Descr() *schema.Column {
	return w.descr
}

func (w *columnWriter) Properties() *parquet.WriterProperties {
	return w.props
}

func (w *columnWriter) TotalCompressedBytes() int64 {
	return w.totalCompressedBytes
}

func (w *columnWriter) TotalBytesWritten() int64 {
	bufferedPagesBytes := int64(0)
	for _, p := range w.pages {
		bufferedPagesBytes += int64(len(p.Data()))
	}

	return w.totalBytesWritten + bufferedPagesBytes
}

func (w *columnWriter) RowsWritten() int {
	return w.rowsWritten + w.numBufferedRows
}

func (w *columnWriter) WriteDataPage(page DataPage) error {
	written, err := w.pager.WriteDataPage(page)
	w.totalBytesWritten += written
	return err
}

func (w *columnWriter) WriteDefinitionLevels(levels []int16) {
	w.defEncoder.EncodeNoFlush(levels)
}

func (w *columnWriter) WriteRepetitionLevels(levels []int16) {
	w.repEncoder.EncodeNoFlush(levels)
}

func (w *columnWriter) reset() {
	w.defLevelSink.Reset(0)
	w.repLevelSink.Reset(0)

	if w.props.DataPageVersion() == parquet.DataPageV1 {
		// offset the buffers to leave room to prepend the length in bytes
		// of the RLE-encoded levels once we've encoded them
		if w.descr.MaxDefinitionLevel() > 0 {
			w.defLevelSink.SetOffset(arrow.Uint32SizeBytes)
		}
		if w.descr.MaxRepetitionLevel() > 0 {
			w.repLevelSink.SetOffset(arrow.Uint32SizeBytes)
		}
	}

	w.defEncoder.Reset(w.descr.MaxDefinitionLevel())
	w.repEncoder.Reset(w.descr.MaxRepetitionLevel())
}
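
// Framing note for the offsets reserved in reset: in a V1 data page the
// RLE-encoded levels carry a 4-byte little-endian length prefix, so encoded
// levels occupying n bytes are laid out as
//
//	[uint32(n) little-endian][n bytes of RLE data]
//
// FlushCurrentPage rewinds the sink offset to 0 and fills in the actual
// length once the levels have been encoded.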

func (w *columnWriter) concatBuffers(defLevelsSize, repLevelsSize int32, values []byte, wr io.Writer) {
	// page layout order: repetition levels, then definition levels, then
	// values. Write errors are ignored since callers pass in-memory buffers.
	wr.Write(w.repLevelSink.Bytes()[:repLevelsSize])
	wr.Write(w.defLevelSink.Bytes()[:defLevelsSize])
	wr.Write(values)
}

func (w *columnWriter) EstimatedBufferedValueBytes() int64 {
	return w.currentEncoder.EstimatedDataEncodedSize()
}

func (w *columnWriter) commitWriteAndCheckPageLimit(numLevels, numValues int64) error {
	w.numBufferedValues += numLevels
	w.numDataValues += numValues

	enc := w.currentEncoder.EstimatedDataEncodedSize()
	if enc >= w.props.DataPageSize() {
		return w.FlushCurrentPage()
	}
	return nil
}

func (w *columnWriter) FlushCurrentPage() error {
	var (
		defLevelsRLESize int32 = 0
		repLevelsRLESize int32 = 0
	)

	values, err := w.currentEncoder.FlushValues()
	if err != nil {
		return err
	}
	defer values.Release()

	isV1DataPage := w.props.DataPageVersion() == parquet.DataPageV1
	if w.descr.MaxDefinitionLevel() > 0 {
		w.defEncoder.Flush()
		w.defLevelSink.SetOffset(0)
		sz := w.defEncoder.Len()
		if isV1DataPage {
			sz += arrow.Uint32SizeBytes
			binary.LittleEndian.PutUint32(w.defLevelSink.Bytes(), uint32(w.defEncoder.Len()))
		}
		defLevelsRLESize = int32(sz)
	}

	if w.descr.MaxRepetitionLevel() > 0 {
		w.repEncoder.Flush()
		w.repLevelSink.SetOffset(0)
		if isV1DataPage {
			binary.LittleEndian.PutUint32(w.repLevelSink.Bytes(), uint32(w.repEncoder.Len()))
		}
		repLevelsRLESize = int32(w.repLevelSink.Len())
	}

	uncompressed := defLevelsRLESize + repLevelsRLESize + int32(values.Len())
	if isV1DataPage {
		err = w.buildDataPageV1(defLevelsRLESize, repLevelsRLESize, uncompressed, values.Bytes())
	} else {
		err = w.buildDataPageV2(defLevelsRLESize, repLevelsRLESize, uncompressed, values.Bytes())
	}

	w.reset()
	w.rowsWritten += w.numBufferedRows
	w.numBufferedValues, w.numDataValues, w.numBufferedRows = 0, 0, 0
	return err
}

func (w *columnWriter) buildDataPageV1(defLevelsRLESize, repLevelsRLESize, uncompressed int32, values []byte) error {
	w.uncompressedData.Reset()
	w.uncompressedData.Grow(int(uncompressed))
	w.concatBuffers(defLevelsRLESize, repLevelsRLESize, values, &w.uncompressedData)

	pageStats, err := w.getPageStatistics()
	if err != nil {
		return err
	}
	pageStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
	pageStats.Signed = schema.SortSIGNED == w.descr.SortOrder()
	w.resetPageStatistics()

	var data []byte
	if w.pager.HasCompressor() {
		w.compressedTemp.Reset()
		data = w.pager.Compress(w.compressedTemp, w.uncompressedData.Bytes())
	} else {
		data = w.uncompressedData.Bytes()
	}

	// if dictionary encoding is active, buffer the page so it can be written
	// out after the dictionary page; otherwise (no dictionary, or dictionary
	// encoding has fallen back to plain) write the page to the sink eagerly
	if w.hasDict && !w.fallbackToNonDict {
		// the scratch buffers are reused for each page, so a buffered page
		// needs its own copy of the data
		pageSlice := make([]byte, len(data))
		copy(pageSlice, data)
		page := NewDataPageV1WithStats(memory.NewBufferBytes(pageSlice), int32(w.numBufferedValues), w.encoding, parquet.Encodings.RLE, parquet.Encodings.RLE, uncompressed, pageStats)
		w.totalCompressedBytes += int64(page.buf.Len()) // + size of the page header
		w.pages = append(w.pages, page)
	} else {
		w.totalCompressedBytes += int64(len(data))
		dp := NewDataPageV1WithStats(memory.NewBufferBytes(data), int32(w.numBufferedValues), w.encoding, parquet.Encodings.RLE, parquet.Encodings.RLE, uncompressed, pageStats)
		defer dp.Release()
		return w.WriteDataPage(dp)
	}
	return nil
}

func (w *columnWriter) buildDataPageV2(defLevelsRLESize, repLevelsRLESize, uncompressed int32, values []byte) error {
	var data []byte
	if w.pager.HasCompressor() {
		w.compressedTemp.Reset()
		data = w.pager.Compress(w.compressedTemp, values)
	} else {
		data = values
	}

	// concatenate uncompressed levels and the possibly compressed values
	var combined bytes.Buffer
	combined.Grow(int(defLevelsRLESize + repLevelsRLESize + int32(len(data))))
	w.concatBuffers(defLevelsRLESize, repLevelsRLESize, data, &combined)

	pageStats, err := w.getPageStatistics()
	if err != nil {
		return err
	}
	pageStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
	pageStats.Signed = schema.SortSIGNED == w.descr.SortOrder()
	w.resetPageStatistics()

	numValues := int32(w.numBufferedValues)
	numRows := int32(w.numBufferedRows)
	nullCount := int32(pageStats.NullCount)
	defLevelsByteLen := int32(defLevelsRLESize)
	repLevelsByteLen := int32(repLevelsRLESize)

	page := NewDataPageV2WithStats(memory.NewBufferBytes(combined.Bytes()), numValues, nullCount, numRows, w.encoding,
		defLevelsByteLen, repLevelsByteLen, uncompressed, w.pager.HasCompressor(), pageStats)
	if w.hasDict && !w.fallbackToNonDict {
		w.totalCompressedBytes += int64(page.buf.Len()) // + size of the page header
		w.pages = append(w.pages, page)
	} else {
		w.totalCompressedBytes += int64(combined.Len())
		defer page.Release()
		return w.WriteDataPage(page)
	}
	return nil
}
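
// Layout note on V1 vs V2 pages: buildDataPageV1 compresses the whole
// concatenation [rep levels][def levels][values] as a single block, while
// buildDataPageV2 compresses only the values and leaves the levels
// uncompressed, as the V2 data page format requires; that is why the two
// builders compress at different points.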

func (w *columnWriter) FlushBufferedDataPages() (err error) {
	if w.numBufferedValues > 0 {
		if err = w.FlushCurrentPage(); err != nil {
			return err
		}
	}

	for _, p := range w.pages {
		defer p.Release()
		if err = w.WriteDataPage(p); err != nil {
			return err
		}
	}
	w.pages = w.pages[:0]
	return
}

func (w *columnWriter) writeLevels(numValues int64, defLevels, repLevels []int16) int64 {
	toWrite := int64(0)
	// if the field is required and non-repeated, there are no definition levels
	if defLevels != nil && w.descr.MaxDefinitionLevel() > 0 {
		for _, v := range defLevels[:numValues] {
			if v == w.descr.MaxDefinitionLevel() {
				toWrite++
			}
		}
		w.WriteDefinitionLevels(defLevels[:numValues])
	} else {
		toWrite = numValues
	}

	if repLevels != nil && w.descr.MaxRepetitionLevel() > 0 {
		// a row could include more than one value;
		// count the occasions where we start a new row
		for _, v := range repLevels[:numValues] {
			if v == 0 {
				w.numBufferedRows++
			}
		}

		w.WriteRepetitionLevels(repLevels[:numValues])
	} else {
		// each value is exactly one row
		w.numBufferedRows += int(numValues)
	}
	return toWrite
}
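
// Example (illustrative): with MaxDefinitionLevel 2 and MaxRepetitionLevel 1,
// writeLevels(4, []int16{2, 2, 1, 2}, []int16{0, 1, 0, 0}) returns 3 (three
// def levels equal the max, so three leaf values will be written) and
// increments numBufferedRows by 3 (three zeros in the rep levels mark three
// new rows).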

func (w *columnWriter) writeLevelsSpaced(numLevels int64, defLevels, repLevels []int16) {
	if w.descr.MaxDefinitionLevel() > 0 {
		w.WriteDefinitionLevels(defLevels[:numLevels])
	}

	if w.descr.MaxRepetitionLevel() > 0 {
		for _, v := range repLevels {
			if v == 0 {
				w.numBufferedRows++
			}
		}
		w.WriteRepetitionLevels(repLevels[:numLevels])
	} else {
		w.numBufferedRows += int(numLevels)
	}
}

func (w *columnWriter) WriteDictionaryPage() error {
	dictEncoder := w.currentEncoder.(encoding.DictEncoder)
	buffer := memory.NewResizableBuffer(w.mem)
	buffer.Resize(dictEncoder.DictEncodedSize())
	dictEncoder.WriteDict(buffer.Bytes())
	defer buffer.Release()

	page := NewDictionaryPage(buffer, int32(dictEncoder.NumEntries()), w.props.DictionaryPageEncoding())
	written, err := w.pager.WriteDictionaryPage(page)
	w.totalBytesWritten += written
	return err
}

type batchWriteInfo struct {
	batchNum  int64
	nullCount int64
}

func (b batchWriteInfo) numSpaced() int64 { return b.batchNum + b.nullCount }

// maybeCalculateValidityBits always populates the returned batchWriteInfo
// (the number of values to write and the null count). Additionally it
// updates the validity bitmap if required (i.e. if at least one level of
// nullable structs directly precedes the leaf node).
func (w *columnWriter) maybeCalculateValidityBits(defLevels []int16, batchSize int64) (out batchWriteInfo) {
	if w.bitsBuffer == nil {
		if w.levelInfo.DefLevel == 0 {
			// in this case def levels should be null and we only
			// need to output counts which will always be equal to
			// the batch size passed in (max def level == 0 indicates
			// there cannot be repeated or null fields)
			out.batchNum = batchSize
			out.nullCount = 0
		} else {
			var (
				toWrite       int64
				spacedToWrite int64
			)
			for i := int64(0); i < batchSize; i++ {
				if defLevels[i] == w.levelInfo.DefLevel {
					toWrite++
				}
				if defLevels[i] >= w.levelInfo.RepeatedAncestorDefLevel {
					spacedToWrite++
				}
			}
			out.batchNum += toWrite
			out.nullCount = spacedToWrite - toWrite
		}
		return
	}

	// resize without shrinking: shrinking to fit could cause another allocation
	newBitmapSize := bitutil.BytesForBits(batchSize)
	if newBitmapSize != int64(w.bitsBuffer.Len()) {
		w.bitsBuffer.ResizeNoShrink(int(newBitmapSize))
	}

	io := ValidityBitmapInputOutput{
		ValidBits:      w.bitsBuffer.Bytes(),
		ReadUpperBound: batchSize,
	}
	DefLevelsToBitmap(defLevels[:batchSize], w.levelInfo, &io)
	out.batchNum = io.Read - io.NullCount
	out.nullCount = io.NullCount
	return
}
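
// Continuing the worked example after computeLevelInfo: with DefLevel 2 and
// RepeatedAncestorDefLevel 1, a def level of 2 counts toward batchNum (a
// present leaf value), a def level of 1 counts toward nullCount (a null that
// still occupies a spaced slot), and a def level of 0 (an empty list) is
// excluded from the spaced values entirely.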

func (w *columnWriter) getPageStatistics() (enc metadata.EncodedStatistics, err error) {
	if w.pageStatistics != nil {
		enc, err = w.pageStatistics.Encode()
	}
	return
}

func (w *columnWriter) getChunkStatistics() (enc metadata.EncodedStatistics, err error) {
	if w.chunkStatistics != nil {
		enc, err = w.chunkStatistics.Encode()
	}
	return
}

func (w *columnWriter) resetPageStatistics() {
	if w.chunkStatistics != nil {
		w.chunkStatistics.Merge(w.pageStatistics)
		w.pageStatistics.Reset()
	}
}

func (w *columnWriter) Close() (err error) {
	if !w.closed {
		w.closed = true
		if w.hasDict && !w.fallbackToNonDict {
			if err = w.WriteDictionaryPage(); err != nil {
				return err
			}
		}

		if err = w.FlushBufferedDataPages(); err != nil {
			return err
		}

		// ensure we release and reset everything even if we
		// error out from the chunk statistics handling
		defer func() {
			w.defLevelSink.Reset(0)
			w.repLevelSink.Reset(0)
			if w.bitsBuffer != nil {
				w.bitsBuffer.Release()
				w.bitsBuffer = nil
			}

			w.currentEncoder.Release()
			w.currentEncoder = nil
		}()

		var chunkStats metadata.EncodedStatistics
		chunkStats, err = w.getChunkStatistics()
		if err != nil {
			return err
		}

		chunkStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path())))
		chunkStats.Signed = schema.SortSIGNED == w.descr.SortOrder()

		if w.rowsWritten > 0 && chunkStats.IsSet() {
			w.metaData.SetStats(chunkStats)
		}
		err = w.pager.Close(w.hasDict, w.fallbackToNonDict)
	}
	return err
}

func (w *columnWriter) doBatches(total int64, repLevels []int16, action func(offset, batch int64)) {
	batchSize := w.props.WriteBatchSize()
	// if we're writing V1 data pages, have no rep levels, or the max rep
	// level is 0, just use the plain doBatches function
	if w.props.DataPageVersion() == parquet.DataPageV1 || repLevels == nil || w.descr.MaxRepetitionLevel() == 0 {
		doBatches(total, batchSize, action)
		return
	}

	// if we get here, we have repetition levels to write and we're writing
	// V2 data pages. Since we check whether to flush after each batch we
	// write, ensuring that every batch begins and ends on a row boundary
	// lets us avoid complex logic inside the flushing and batch-writing
	// functions. The WriteBatch functions recover from panics, so we can
	// just panic here on a failure and it'll get caught by the WriteBatch
	// function above us.
	if int64(len(repLevels)) < total {
		// if we're writing repLevels there has to be at least enough in the
		// slice to write the total number that we're being asked to write
		panic("columnwriter: not enough repetition levels for batch to write")
	}

	if repLevels[0] != 0 {
		panic("columnwriter: batch writing for V2 data pages must start at a row boundary")
	}

	// loop by batchSize, but make sure we're ending/starting each batch on a row boundary
	var (
		batchStart, batch int64
	)
	for batchStart = 0; batchStart+batchSize < int64(len(repLevels)); batchStart += batch {
		// check one past the last value of the batch to see whether it
		// starts a new row; if it doesn't, shrink the batch, falling back
		// to the most recent row boundary to end on
		batch = batchSize
		for ; repLevels[batchStart+batch] != 0; batch-- {
		}
		// batchStart <--> batch now begins and ends on a row boundary!
		action(batchStart, batch)
	}
	action(batchStart, int64(len(repLevels))-batchStart)
}

func doBatches(total, batchSize int64, action func(offset, batch int64)) {
	numBatches := total / batchSize
	for i := int64(0); i < numBatches; i++ {
		action(i*batchSize, batchSize)
	}
	if total%batchSize > 0 {
		action(numBatches*batchSize, total%batchSize)
	}
}
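
// For instance, doBatches(10, 4, action) invokes action(0, 4), action(4, 4),
// and action(8, 2). The row-aware variant above instead shrinks each batch
// until the element one past its end has rep level 0, so every action call
// spans whole rows.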

func levelSliceOrNil(rep []int16, offset, batch int64) []int16 {
	if rep == nil {
		return nil
	}
	return rep[offset : batch+offset]
}

//lint:ignore U1000 maybeReplaceValidity
func (w *columnWriter) maybeReplaceValidity(values arrow.Array, newNullCount int64) arrow.Array {
	if w.bitsBuffer == nil {
		values.Retain()
		return values
	}

	if len(values.Data().Buffers()) == 0 {
		values.Retain()
		return values
	}

	buffers := make([]*memory.Buffer, len(values.Data().Buffers()))
	copy(buffers, values.Data().Buffers())
	// bitsBuffer should already be the offset slice of the validity bits
	// we want, so we don't need to manually slice the validity buffer
	buffers[0] = w.bitsBuffer

	if values.Data().Offset() > 0 {
		data := values.Data()
		elemSize := data.DataType().(arrow.FixedWidthDataType).Bytes()
		start := data.Offset() * elemSize
		end := start + data.Len()*elemSize
		buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[start:end])
	}

	data := array.NewData(values.DataType(), values.Len(), buffers, nil, int(newNullCount), 0)
	defer data.Release()
	return array.MakeFromData(data)
}