github.com/apache/arrow/go/v7@v7.0.1/parquet/file/column_writer.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "bytes" 21 "encoding/binary" 22 "io" 23 24 "github.com/apache/arrow/go/v7/arrow" 25 "github.com/apache/arrow/go/v7/arrow/array" 26 "github.com/apache/arrow/go/v7/arrow/bitutil" 27 "github.com/apache/arrow/go/v7/arrow/memory" 28 "github.com/apache/arrow/go/v7/parquet" 29 "github.com/apache/arrow/go/v7/parquet/internal/encoding" 30 "github.com/apache/arrow/go/v7/parquet/metadata" 31 "github.com/apache/arrow/go/v7/parquet/schema" 32 ) 33 34 //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=../internal/encoding/physical_types.tmpldata column_writer_types.gen.go.tmpl 35 36 // ColumnChunkWriter is the base interface for all columnwriters. To directly write 37 // data to the column, you need to assert it to the correctly typed ColumnChunkWriter 38 // instance, such as Int32ColumnWriter. 39 type ColumnChunkWriter interface { 40 // Close ends this column and returns the number of bytes written 41 Close() error 42 // Type returns the underlying physical parquet type for this column 43 Type() parquet.Type 44 // Descr returns the column information for this writer 45 Descr() *schema.Column 46 // RowsWritten returns the number of rows that have so far been written with this writer 47 RowsWritten() int 48 // TotalCompressedBytes returns the number of bytes, after compression, that have been written so far 49 TotalCompressedBytes() int64 50 // TotalBytesWritten includes the bytes for writing dictionary pages, while TotalCompressedBytes is 51 // just the data and page headers 52 TotalBytesWritten() int64 53 // Properties returns the current WriterProperties in use for this writer 54 Properties() *parquet.WriterProperties 55 56 LevelInfo() LevelInfo 57 SetBitsBuffer(*memory.Buffer) 58 } 59 60 func computeLevelInfo(descr *schema.Column) (info LevelInfo) { 61 info.DefLevel = descr.MaxDefinitionLevel() 62 info.RepLevel = descr.MaxRepetitionLevel() 63 64 minSpacedDefLevel := descr.MaxDefinitionLevel() 65 n := descr.SchemaNode() 66 for n != nil && n.RepetitionType() != parquet.Repetitions.Repeated { 67 if n.RepetitionType() == parquet.Repetitions.Optional { 68 minSpacedDefLevel-- 69 } 70 n = n.Parent() 71 } 72 info.RepeatedAncestorDefLevel = minSpacedDefLevel 73 return 74 } 75 76 type columnWriter struct { 77 metaData *metadata.ColumnChunkMetaDataBuilder 78 descr *schema.Column 79 80 // scratch buffer if validity bits need to be recalculated 81 bitsBuffer *memory.Buffer 82 levelInfo LevelInfo 83 pager PageWriter 84 hasDict bool 85 encoding parquet.Encoding 86 props *parquet.WriterProperties 87 defEncoder encoding.LevelEncoder 88 repEncoder encoding.LevelEncoder 89 mem memory.Allocator 90 91 pageStatistics metadata.TypedStatistics 92 chunkStatistics metadata.TypedStatistics 93 94 // total number of values stored in the current data page. this is the maximum 95 // of the number of encoded def levels or encoded values. for 96 // non-repeated, required columns, this is equal to the number of encoded 97 // values. For repeated or optional values, there may be fewer data values 98 // than levels, and this tells you how many encoded levels there are in that case 99 numBufferedValues int64 100 101 // total number of rows stored in the current data page. This may be larger 102 // than numBufferedValues when writing a column with repeated values. This is 103 // the number of rows written since the last time we flushed a page. 104 numBufferedRows int 105 106 // the total number of stored values in the current page. for repeated or optional 107 // values. this number may be lower than numBuffered 108 numDataValues int64 109 110 rowsWritten int 111 totalBytesWritten int64 112 // records the current number of compressed bytes in a column 113 totalCompressedBytes int64 114 closed bool 115 fallbackToNonDict bool 116 117 pages []DataPage 118 119 defLevelSink *encoding.PooledBufferWriter 120 repLevelSink *encoding.PooledBufferWriter 121 122 uncompressedData bytes.Buffer 123 compressedTemp *bytes.Buffer 124 125 currentEncoder encoding.TypedEncoder 126 } 127 128 func newColumnWriterBase(metaData *metadata.ColumnChunkMetaDataBuilder, pager PageWriter, useDict bool, enc parquet.Encoding, props *parquet.WriterProperties) columnWriter { 129 ret := columnWriter{ 130 metaData: metaData, 131 descr: metaData.Descr(), 132 levelInfo: computeLevelInfo(metaData.Descr()), 133 pager: pager, 134 hasDict: useDict, 135 encoding: enc, 136 props: props, 137 mem: props.Allocator(), 138 defLevelSink: encoding.NewPooledBufferWriter(0), 139 repLevelSink: encoding.NewPooledBufferWriter(0), 140 } 141 if pager.HasCompressor() { 142 ret.compressedTemp = new(bytes.Buffer) 143 } 144 if props.StatisticsEnabledFor(ret.descr.Path()) && ret.descr.SortOrder() != schema.SortUNKNOWN { 145 ret.pageStatistics = metadata.NewStatistics(ret.descr, props.Allocator()) 146 ret.chunkStatistics = metadata.NewStatistics(ret.descr, props.Allocator()) 147 } 148 149 ret.defEncoder.Init(parquet.Encodings.RLE, ret.descr.MaxDefinitionLevel(), ret.defLevelSink) 150 ret.repEncoder.Init(parquet.Encodings.RLE, ret.descr.MaxRepetitionLevel(), ret.repLevelSink) 151 152 ret.reset() 153 154 return ret 155 } 156 157 func (w *columnWriter) SetBitsBuffer(buf *memory.Buffer) { w.bitsBuffer = buf } 158 159 func (w *columnWriter) LevelInfo() LevelInfo { return w.levelInfo } 160 161 func (w *columnWriter) Type() parquet.Type { 162 return w.descr.PhysicalType() 163 } 164 165 func (w *columnWriter) Descr() *schema.Column { 166 return w.descr 167 } 168 169 func (w *columnWriter) Properties() *parquet.WriterProperties { 170 return w.props 171 } 172 173 func (w *columnWriter) TotalCompressedBytes() int64 { 174 return w.totalCompressedBytes 175 } 176 177 func (w *columnWriter) TotalBytesWritten() int64 { 178 return w.totalBytesWritten 179 } 180 181 func (w *columnWriter) RowsWritten() int { 182 return w.rowsWritten + w.numBufferedRows 183 } 184 185 func (w *columnWriter) WriteDataPage(page DataPage) error { 186 written, err := w.pager.WriteDataPage(page) 187 w.totalBytesWritten += written 188 return err 189 } 190 191 func (w *columnWriter) WriteDefinitionLevels(levels []int16) { 192 w.defEncoder.EncodeNoFlush(levels) 193 } 194 195 func (w *columnWriter) WriteRepetitionLevels(levels []int16) { 196 w.repEncoder.EncodeNoFlush(levels) 197 } 198 199 func (w *columnWriter) reset() { 200 w.defLevelSink.Reset(0) 201 w.repLevelSink.Reset(0) 202 203 if w.props.DataPageVersion() == parquet.DataPageV1 { 204 // offset the buffers to make room to record the number of levels at the 205 // beginning of each after we've encoded them with RLE 206 if w.descr.MaxDefinitionLevel() > 0 { 207 w.defLevelSink.SetOffset(arrow.Uint32SizeBytes) 208 } 209 if w.descr.MaxRepetitionLevel() > 0 { 210 w.repLevelSink.SetOffset(arrow.Uint32SizeBytes) 211 } 212 } 213 214 w.defEncoder.Reset(w.descr.MaxDefinitionLevel()) 215 w.repEncoder.Reset(w.descr.MaxRepetitionLevel()) 216 } 217 218 func (w *columnWriter) concatBuffers(defLevelsSize, repLevelsSize int32, values []byte, wr io.Writer) { 219 wr.Write(w.repLevelSink.Bytes()[:repLevelsSize]) 220 wr.Write(w.defLevelSink.Bytes()[:defLevelsSize]) 221 wr.Write(values) 222 } 223 224 func (w *columnWriter) EstimatedBufferedValueBytes() int64 { 225 return w.currentEncoder.EstimatedDataEncodedSize() 226 } 227 228 func (w *columnWriter) commitWriteAndCheckPageLimit(numLevels, numValues int64) error { 229 w.numBufferedValues += numLevels 230 w.numDataValues += numValues 231 232 if w.currentEncoder.EstimatedDataEncodedSize() >= w.props.DataPageSize() { 233 return w.FlushCurrentPage() 234 } 235 return nil 236 } 237 238 func (w *columnWriter) FlushCurrentPage() error { 239 var ( 240 defLevelsRLESize int32 = 0 241 repLevelsRLESize int32 = 0 242 ) 243 244 values, err := w.currentEncoder.FlushValues() 245 if err != nil { 246 return err 247 } 248 defer values.Release() 249 250 isV1DataPage := w.props.DataPageVersion() == parquet.DataPageV1 251 if w.descr.MaxDefinitionLevel() > 0 { 252 w.defEncoder.Flush() 253 w.defLevelSink.SetOffset(0) 254 sz := w.defEncoder.Len() 255 if isV1DataPage { 256 sz += arrow.Uint32SizeBytes 257 binary.LittleEndian.PutUint32(w.defLevelSink.Bytes(), uint32(w.defEncoder.Len())) 258 } 259 defLevelsRLESize = int32(sz) 260 } 261 262 if w.descr.MaxRepetitionLevel() > 0 { 263 w.repEncoder.Flush() 264 w.repLevelSink.SetOffset(0) 265 if isV1DataPage { 266 binary.LittleEndian.PutUint32(w.repLevelSink.Bytes(), uint32(w.repEncoder.Len())) 267 } 268 repLevelsRLESize = int32(w.repLevelSink.Len()) 269 } 270 271 uncompressed := defLevelsRLESize + repLevelsRLESize + int32(values.Len()) 272 if isV1DataPage { 273 w.buildDataPageV1(defLevelsRLESize, repLevelsRLESize, uncompressed, values.Bytes()) 274 } else { 275 w.buildDataPageV2(defLevelsRLESize, repLevelsRLESize, uncompressed, values.Bytes()) 276 } 277 278 w.reset() 279 w.rowsWritten += w.numBufferedRows 280 w.numBufferedValues, w.numDataValues, w.numBufferedRows = 0, 0, 0 281 return nil 282 } 283 284 func (w *columnWriter) buildDataPageV1(defLevelsRLESize, repLevelsRLESize, uncompressed int32, values []byte) error { 285 w.uncompressedData.Reset() 286 w.uncompressedData.Grow(int(uncompressed)) 287 w.concatBuffers(defLevelsRLESize, repLevelsRLESize, values, &w.uncompressedData) 288 289 pageStats, err := w.getPageStatistics() 290 if err != nil { 291 return err 292 } 293 pageStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path()))) 294 pageStats.Signed = schema.SortSIGNED == w.descr.SortOrder() 295 w.resetPageStatistics() 296 297 var data []byte 298 if w.pager.HasCompressor() { 299 w.compressedTemp.Reset() 300 data = w.pager.Compress(w.compressedTemp, w.uncompressedData.Bytes()) 301 } else { 302 data = w.uncompressedData.Bytes() 303 } 304 305 // write the page to sink eagerly if there's no dictionary or if dictionary encoding has fallen back 306 if w.hasDict && !w.fallbackToNonDict { 307 pageSlice := make([]byte, len(data)) 308 copy(pageSlice, data) 309 page := NewDataPageV1WithStats(memory.NewBufferBytes(pageSlice), int32(w.numBufferedValues), w.encoding, parquet.Encodings.RLE, parquet.Encodings.RLE, uncompressed, pageStats) 310 w.totalCompressedBytes += int64(page.buf.Len()) // + size of Pageheader 311 w.pages = append(w.pages, page) 312 } else { 313 w.totalCompressedBytes += int64(len(data)) 314 dp := NewDataPageV1WithStats(memory.NewBufferBytes(data), int32(w.numBufferedValues), w.encoding, parquet.Encodings.RLE, parquet.Encodings.RLE, uncompressed, pageStats) 315 defer dp.Release() 316 w.WriteDataPage(dp) 317 } 318 return nil 319 } 320 321 func (w *columnWriter) buildDataPageV2(defLevelsRLESize, repLevelsRLESize, uncompressed int32, values []byte) error { 322 var data []byte 323 if w.pager.HasCompressor() { 324 w.compressedTemp.Reset() 325 data = w.pager.Compress(w.compressedTemp, values) 326 } else { 327 data = values 328 } 329 330 // concatenate uncompressed levels and the possibly compressed values 331 var combined bytes.Buffer 332 combined.Grow(int(defLevelsRLESize + repLevelsRLESize + int32(len(data)))) 333 w.concatBuffers(defLevelsRLESize, repLevelsRLESize, data, &combined) 334 335 pageStats, err := w.getPageStatistics() 336 if err != nil { 337 return err 338 } 339 pageStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path()))) 340 pageStats.Signed = schema.SortSIGNED == w.descr.SortOrder() 341 w.resetPageStatistics() 342 343 numValues := int32(w.numBufferedValues) 344 numRows := int32(w.numBufferedRows) 345 nullCount := int32(pageStats.NullCount) 346 defLevelsByteLen := int32(defLevelsRLESize) 347 repLevelsByteLen := int32(repLevelsRLESize) 348 349 page := NewDataPageV2WithStats(memory.NewBufferBytes(combined.Bytes()), numValues, nullCount, numRows, w.encoding, 350 defLevelsByteLen, repLevelsByteLen, uncompressed, w.pager.HasCompressor(), pageStats) 351 if w.hasDict && !w.fallbackToNonDict { 352 w.totalCompressedBytes += int64(page.buf.Len()) // + sizeof pageheader 353 w.pages = append(w.pages, page) 354 } else { 355 w.totalCompressedBytes += int64(combined.Len()) 356 defer page.Release() 357 w.WriteDataPage(page) 358 } 359 return nil 360 } 361 362 func (w *columnWriter) FlushBufferedDataPages() { 363 if w.numBufferedValues > 0 { 364 w.FlushCurrentPage() 365 } 366 367 for _, p := range w.pages { 368 defer p.Release() 369 w.WriteDataPage(p) 370 } 371 w.pages = w.pages[:0] 372 w.totalCompressedBytes = 0 373 } 374 375 func (w *columnWriter) writeLevels(numValues int64, defLevels, repLevels []int16) int64 { 376 toWrite := int64(0) 377 // if the field is required and non-repeated, no definition levels 378 if defLevels != nil && w.descr.MaxDefinitionLevel() > 0 { 379 for _, v := range defLevels[:numValues] { 380 if v == w.descr.MaxDefinitionLevel() { 381 toWrite++ 382 } 383 } 384 w.WriteDefinitionLevels(defLevels[:numValues]) 385 } else { 386 toWrite = numValues 387 } 388 389 if repLevels != nil && w.descr.MaxRepetitionLevel() > 0 { 390 // a row could include more than one value 391 //count the occasions where we start a new row 392 for _, v := range repLevels[:numValues] { 393 if v == 0 { 394 w.numBufferedRows++ 395 } 396 } 397 398 w.WriteRepetitionLevels(repLevels[:numValues]) 399 } else { 400 // each value is exactly 1 row 401 w.numBufferedRows += int(numValues) 402 } 403 return toWrite 404 } 405 406 func (w *columnWriter) writeLevelsSpaced(numLevels int64, defLevels, repLevels []int16) { 407 if w.descr.MaxDefinitionLevel() > 0 { 408 w.WriteDefinitionLevels(defLevels[:numLevels]) 409 } 410 411 if w.descr.MaxRepetitionLevel() > 0 { 412 for _, v := range repLevels { 413 if v == 0 { 414 w.numBufferedRows++ 415 } 416 } 417 w.WriteRepetitionLevels(repLevels[:numLevels]) 418 } else { 419 w.numBufferedRows += int(numLevels) 420 } 421 } 422 423 func (w *columnWriter) WriteDictionaryPage() error { 424 dictEncoder := w.currentEncoder.(encoding.DictEncoder) 425 buffer := memory.NewResizableBuffer(w.mem) 426 buffer.Resize(dictEncoder.DictEncodedSize()) 427 dictEncoder.WriteDict(buffer.Bytes()) 428 defer buffer.Release() 429 430 page := NewDictionaryPage(buffer, int32(dictEncoder.NumEntries()), w.props.DictionaryPageEncoding()) 431 written, err := w.pager.WriteDictionaryPage(page) 432 w.totalBytesWritten += written 433 return err 434 } 435 436 type batchWriteInfo struct { 437 batchNum int64 438 nullCount int64 439 } 440 441 func (b batchWriteInfo) numSpaced() int64 { return b.batchNum + b.nullCount } 442 443 // this will always update the three output params 444 // outValsToWrite, outSpacedValsToWrite, and NullCount. Additionally 445 // it will update the validity bitmap if required (i.e. if at least one 446 // level of nullable structs directly precede the leaf node) 447 func (w *columnWriter) maybeCalculateValidityBits(defLevels []int16, batchSize int64) (out batchWriteInfo) { 448 if w.bitsBuffer == nil { 449 if w.levelInfo.DefLevel == 0 { 450 // in this case def levels should be null and we only 451 // need to output counts which will always be equal to 452 // the batch size passed in (max def level == 0 indicates 453 // there cannot be repeated or null fields) 454 out.batchNum = batchSize 455 out.nullCount = 0 456 } else { 457 var ( 458 toWrite int64 459 spacedToWrite int64 460 ) 461 for i := int64(0); i < batchSize; i++ { 462 if defLevels[i] == w.levelInfo.DefLevel { 463 toWrite++ 464 } 465 if defLevels[i] >= w.levelInfo.RepeatedAncestorDefLevel { 466 spacedToWrite++ 467 } 468 } 469 out.batchNum += toWrite 470 out.nullCount = spacedToWrite - toWrite 471 } 472 return 473 } 474 475 // shrink to fit possible causes another allocation 476 newBitmapSize := bitutil.BytesForBits(batchSize) 477 if newBitmapSize != int64(w.bitsBuffer.Len()) { 478 w.bitsBuffer.ResizeNoShrink(int(newBitmapSize)) 479 } 480 481 io := ValidityBitmapInputOutput{ 482 ValidBits: w.bitsBuffer.Bytes(), 483 ReadUpperBound: batchSize, 484 } 485 DefLevelsToBitmap(defLevels[:batchSize], w.levelInfo, &io) 486 out.batchNum = io.Read - io.NullCount 487 out.nullCount = io.NullCount 488 return 489 } 490 491 func (w *columnWriter) getPageStatistics() (enc metadata.EncodedStatistics, err error) { 492 if w.pageStatistics != nil { 493 enc, err = w.pageStatistics.Encode() 494 } 495 return 496 } 497 498 func (w *columnWriter) getChunkStatistics() (enc metadata.EncodedStatistics, err error) { 499 if w.chunkStatistics != nil { 500 enc, err = w.chunkStatistics.Encode() 501 } 502 return 503 } 504 505 func (w *columnWriter) resetPageStatistics() { 506 if w.chunkStatistics != nil { 507 w.chunkStatistics.Merge(w.pageStatistics) 508 w.pageStatistics.Reset() 509 } 510 } 511 512 func (w *columnWriter) Close() (err error) { 513 if !w.closed { 514 w.closed = true 515 if w.hasDict && !w.fallbackToNonDict { 516 w.WriteDictionaryPage() 517 } 518 519 w.FlushBufferedDataPages() 520 521 var chunkStats metadata.EncodedStatistics 522 chunkStats, err = w.getChunkStatistics() 523 if err != nil { 524 return err 525 } 526 527 chunkStats.ApplyStatSizeLimits(int(w.props.MaxStatsSizeFor(w.descr.Path()))) 528 chunkStats.Signed = schema.SortSIGNED == w.descr.SortOrder() 529 530 if w.rowsWritten > 0 && chunkStats.IsSet() { 531 w.metaData.SetStats(chunkStats) 532 } 533 err = w.pager.Close(w.hasDict, w.fallbackToNonDict) 534 535 w.defLevelSink.Reset(0) 536 w.repLevelSink.Reset(0) 537 } 538 return err 539 } 540 541 func (w *columnWriter) doBatches(total int64, repLevels []int16, action func(offset, batch int64)) { 542 batchSize := w.props.WriteBatchSize() 543 // if we're writing V1 data pages, have no replevels or the max replevel is 0 then just 544 // use the regular doBatches function 545 if w.props.DataPageVersion() == parquet.DataPageV1 || repLevels == nil || w.descr.MaxRepetitionLevel() == 0 { 546 doBatches(total, batchSize, action) 547 return 548 } 549 550 // if we get here that means we have repetition levels to write and we're writing 551 // V2 data pages. since we check whether to flush after each batch we write 552 // if we ensure all the batches begin and end on row boundaries we can avoid 553 // complex logic inside of our flushing or batch writing functions. 554 // the WriteBatch function recovers from panics so we can just panic here on a failure 555 // and it'll get caught by the WriteBatch functions above it 556 if int64(len(repLevels)) < total { 557 // if we're writing repLevels there has to be at least enough in the slice 558 // to write the total number that we're being asked to write 559 panic("columnwriter: not enough repetition levels for batch to write") 560 } 561 562 if repLevels[0] != 0 { 563 panic("columnwriter: batch writing for V2 data pages must start at a row boundary") 564 } 565 566 // loop by batchSize, but make sure we're ending/starting each batch on a row boundary 567 var ( 568 batchStart, batch int64 569 ) 570 for batchStart = 0; batchStart+batchSize < int64(len(repLevels)); batchStart += batch { 571 // check one past the last value of the batch for if it's a new row 572 // if it's not, shrink the batch and feel back to the beginning of a 573 // previous row boundary to end on 574 batch = batchSize 575 for ; repLevels[batchStart+batch] != 0; batch-- { 576 } 577 // batchStart <--> batch now begins and ends on a row boundary! 578 action(batchStart, batch) 579 } 580 action(batchStart, int64(len(repLevels))-batchStart) 581 } 582 583 func doBatches(total, batchSize int64, action func(offset, batch int64)) { 584 numBatches := total / batchSize 585 for i := int64(0); i < numBatches; i++ { 586 action(i*batchSize, batchSize) 587 } 588 if total%batchSize > 0 { 589 action(numBatches*batchSize, total%batchSize) 590 } 591 } 592 593 func levelSliceOrNil(rep []int16, offset, batch int64) []int16 { 594 if rep == nil { 595 return nil 596 } 597 return rep[offset : batch+offset] 598 } 599 600 func (w *ByteArrayColumnChunkWriter) maybeReplaceValidity(values array.Interface, newNullCount int64) array.Interface { 601 if w.bitsBuffer == nil { 602 return values 603 } 604 605 buffers := values.Data().Buffers() 606 if len(buffers) == 0 { 607 return values 608 } 609 // bitsBuffer should already be the offset slice of the validity bits 610 // we want so we don't need to manually slice the validity buffer 611 buffers[0] = w.bitsBuffer 612 613 if values.Data().Offset() > 0 { 614 data := values.Data() 615 buffers[1] = memory.NewBufferBytes(data.Buffers()[1].Bytes()[data.Offset()*arrow.Int32SizeBytes : data.Len()*arrow.Int32SizeBytes]) 616 } 617 return array.MakeFromData(array.NewData(values.DataType(), values.Len(), buffers, nil, int(newNullCount), 0)) 618 }