github.com/apache/arrow/go/v7@v7.0.1/parquet/file/record_reader.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 package file 18 19 import ( 20 "sync/atomic" 21 "unsafe" 22 23 "github.com/JohnCGriffin/overflow" 24 "github.com/apache/arrow/go/v7/arrow" 25 "github.com/apache/arrow/go/v7/arrow/array" 26 "github.com/apache/arrow/go/v7/arrow/bitutil" 27 "github.com/apache/arrow/go/v7/arrow/memory" 28 "github.com/apache/arrow/go/v7/parquet" 29 "github.com/apache/arrow/go/v7/parquet/internal/encoding" 30 "github.com/apache/arrow/go/v7/parquet/internal/utils" 31 "github.com/apache/arrow/go/v7/parquet/schema" 32 "golang.org/x/xerrors" 33 ) 34 35 // RecordReader is an interface for reading entire records/rows at a time 36 // from a parquet file for both flat and nested columns. Properly delimiting 37 // semantic records according to the def and repetition levels. 38 type RecordReader interface { 39 // DefLevels returns the current crop of definition levels for this record 40 DefLevels() []int16 41 // LevelsPos is the number of definition / repetition levels (from the decoded ones) 42 // which the reader has already consumed. 43 LevelsPos() int64 44 // RepLevels returns the current decoded repetition levels 45 RepLevels() []int16 46 // Reset resets the state, clearing consumed values and repetition/definition 47 // levels as the result of calling ReadRecords 48 Reset() 49 // Reserve pre-allocates space for data 50 Reserve(int64) error 51 // HasMore returns true if there is more internal data which hasn't been 52 // processed yet. 53 HasMore() bool 54 // ReadRecords attempts to read the provided number of records from the 55 // column chunk, returning the number of records read and any error. 56 ReadRecords(num int64) (int64, error) 57 // ValuesWritten is the number of values written internally including any nulls 58 ValuesWritten() int 59 // ReleaseValidBits transfers the buffer of bits for the validity bitmap 60 // to the caller, subsequent calls will allocate a new one in the reader. 61 ReleaseValidBits() *memory.Buffer 62 // ReleaseValues transfers the buffer of data with the values to the caller, 63 // a new buffer will be allocated on subsequent calls. 64 ReleaseValues() *memory.Buffer 65 // NullCount returns the number of nulls decoded 66 NullCount() int64 67 // Type returns the parquet physical type of the column 68 Type() parquet.Type 69 // Values returns the decoded data buffer, including any nulls, without 70 // transferring ownership 71 Values() []byte 72 // SetPageReader allows progressing to the next column chunk while reusing 73 // this record reader by providing the page reader for the next chunk. 74 SetPageReader(PageReader) 75 // Retain increments the ref count by one 76 Retain() 77 // Release decrements the ref count by one, releasing the internal buffers when 78 // the ref count is 0. 79 Release() 80 } 81 82 // BinaryRecordReader provides an extra GetBuilderChunks function above and beyond 83 // the plain RecordReader to allow for efficiently building chunked arrays. 84 type BinaryRecordReader interface { 85 RecordReader 86 GetBuilderChunks() []arrow.Array 87 } 88 89 // recordReaderImpl is the internal interface implemented for different types 90 // enabling reuse of the higher level record reader logic. 91 type recordReaderImpl interface { 92 ColumnChunkReader 93 ReadValuesDense(int64) error 94 ReadValuesSpaced(int64, int64) error 95 ReserveValues(int64, bool) error 96 ResetValues() 97 GetValidBits() []byte 98 IncrementWritten(int64, int64) 99 ValuesWritten() int64 100 ReleaseValidBits() *memory.Buffer 101 ReleaseValues() *memory.Buffer 102 NullCount() int64 103 Values() []byte 104 SetPageReader(PageReader) 105 Retain() 106 Release() 107 } 108 109 type binaryRecordReaderImpl interface { 110 recordReaderImpl 111 GetBuilderChunks() []arrow.Array 112 } 113 114 // primitiveRecordReader is a record reader for primitive types, ie: not byte array or fixed len byte array 115 type primitiveRecordReader struct { 116 ColumnChunkReader 117 118 valuesWritten int64 119 valuesCap int64 120 nullCount int64 121 values *memory.Buffer 122 validBits *memory.Buffer 123 mem memory.Allocator 124 125 refCount int64 126 useValues bool 127 } 128 129 func createPrimitiveRecordReader(descr *schema.Column, mem memory.Allocator) primitiveRecordReader { 130 return primitiveRecordReader{ 131 ColumnChunkReader: NewColumnReader(descr, nil, mem), 132 values: memory.NewResizableBuffer(mem), 133 validBits: memory.NewResizableBuffer(mem), 134 mem: mem, 135 refCount: 1, 136 useValues: descr.PhysicalType() != parquet.Types.ByteArray && descr.PhysicalType() != parquet.Types.FixedLenByteArray, 137 } 138 } 139 140 func (pr *primitiveRecordReader) Retain() { 141 atomic.AddInt64(&pr.refCount, 1) 142 } 143 144 func (pr *primitiveRecordReader) Release() { 145 if atomic.AddInt64(&pr.refCount, -1) == 0 { 146 if pr.values != nil { 147 pr.values.Release() 148 pr.values = nil 149 } 150 if pr.validBits != nil { 151 pr.validBits.Release() 152 pr.validBits = nil 153 } 154 } 155 } 156 157 func (pr *primitiveRecordReader) SetPageReader(rdr PageReader) { 158 pr.ColumnChunkReader.setPageReader(rdr) 159 } 160 161 func (pr *primitiveRecordReader) ReleaseValidBits() *memory.Buffer { 162 res := pr.validBits 163 res.Resize(int(bitutil.BytesForBits(pr.valuesWritten))) 164 pr.validBits = memory.NewResizableBuffer(pr.mem) 165 return res 166 } 167 168 func (pr *primitiveRecordReader) ReleaseValues() (res *memory.Buffer) { 169 res = pr.values 170 nbytes, err := pr.numBytesForValues(pr.valuesWritten) 171 if err != nil { 172 panic(err) 173 } 174 res.Resize(int(nbytes)) 175 pr.values = memory.NewResizableBuffer(pr.mem) 176 pr.valuesCap = 0 177 178 return 179 } 180 181 func (pr *primitiveRecordReader) NullCount() int64 { return pr.nullCount } 182 183 func (pr *primitiveRecordReader) IncrementWritten(w, n int64) { 184 pr.valuesWritten += w 185 pr.nullCount += n 186 } 187 func (pr *primitiveRecordReader) GetValidBits() []byte { return pr.validBits.Bytes() } 188 func (pr *primitiveRecordReader) ValuesWritten() int64 { return pr.valuesWritten } 189 func (pr *primitiveRecordReader) Values() []byte { return pr.values.Bytes() } 190 func (pr *primitiveRecordReader) ResetValues() { 191 if pr.valuesWritten > 0 { 192 pr.values.ResizeNoShrink(0) 193 pr.validBits.ResizeNoShrink(0) 194 pr.valuesWritten = 0 195 pr.valuesCap = 0 196 pr.nullCount = 0 197 } 198 } 199 200 func (pr *primitiveRecordReader) numBytesForValues(nitems int64) (num int64, err error) { 201 typeSize := int64(pr.Descriptor().PhysicalType().ByteSize()) 202 var ok bool 203 if num, ok = overflow.Mul64(nitems, typeSize); !ok { 204 err = xerrors.New("total size of items too large") 205 } 206 return 207 } 208 209 func (pr *primitiveRecordReader) ReserveValues(extra int64, hasNullable bool) error { 210 newCap, err := updateCapacity(pr.valuesCap, pr.valuesWritten, extra) 211 if err != nil { 212 return err 213 } 214 if newCap > pr.valuesCap { 215 capBytes, err := pr.numBytesForValues(newCap) 216 if err != nil { 217 return err 218 } 219 if pr.useValues { 220 pr.values.ResizeNoShrink(int(capBytes)) 221 } 222 pr.valuesCap = newCap 223 } 224 if hasNullable { 225 validBytesCap := bitutil.BytesForBits(pr.valuesCap) 226 if pr.validBits.Len() < int(validBytesCap) { 227 pr.validBits.ResizeNoShrink(int(validBytesCap)) 228 } 229 } 230 return nil 231 } 232 233 func (pr *primitiveRecordReader) ReadValuesDense(toRead int64) (err error) { 234 switch cr := pr.ColumnChunkReader.(type) { 235 case *BooleanColumnChunkReader: 236 data := pr.values.Bytes()[int(pr.valuesWritten):] 237 values := *(*[]bool)(unsafe.Pointer(&data)) 238 _, err = cr.curDecoder.(encoding.BooleanDecoder).Decode(values[:toRead]) 239 case *Int32ColumnChunkReader: 240 values := arrow.Int32Traits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 241 _, err = cr.curDecoder.(encoding.Int32Decoder).Decode(values[:toRead]) 242 case *Int64ColumnChunkReader: 243 values := arrow.Int64Traits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 244 _, err = cr.curDecoder.(encoding.Int64Decoder).Decode(values[:toRead]) 245 case *Int96ColumnChunkReader: 246 values := parquet.Int96Traits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 247 _, err = cr.curDecoder.(encoding.Int96Decoder).Decode(values[:toRead]) 248 case *ByteArrayColumnChunkReader: 249 values := parquet.ByteArrayTraits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 250 _, err = cr.curDecoder.(encoding.ByteArrayDecoder).Decode(values[:toRead]) 251 case *FixedLenByteArrayColumnChunkReader: 252 values := parquet.FixedLenByteArrayTraits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 253 _, err = cr.curDecoder.(encoding.FixedLenByteArrayDecoder).Decode(values[:toRead]) 254 case *Float32ColumnChunkReader: 255 values := arrow.Float32Traits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 256 _, err = cr.curDecoder.(encoding.Float32Decoder).Decode(values[:toRead]) 257 case *Float64ColumnChunkReader: 258 values := arrow.Float64Traits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 259 _, err = cr.curDecoder.(encoding.Float64Decoder).Decode(values[:toRead]) 260 default: 261 panic("invalid type for record reader") 262 } 263 return 264 } 265 266 func (pr *primitiveRecordReader) ReadValuesSpaced(valuesWithNulls, nullCount int64) (err error) { 267 validBits := pr.validBits.Bytes() 268 offset := pr.valuesWritten 269 270 switch cr := pr.ColumnChunkReader.(type) { 271 case *BooleanColumnChunkReader: 272 data := pr.values.Bytes()[int(pr.valuesWritten):] 273 values := *(*[]bool)(unsafe.Pointer(&data)) 274 _, err = cr.curDecoder.(encoding.BooleanDecoder).DecodeSpaced(values[:int(valuesWithNulls)], int(nullCount), validBits, offset) 275 case *Int32ColumnChunkReader: 276 values := arrow.Int32Traits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 277 _, err = cr.curDecoder.(encoding.Int32Decoder).DecodeSpaced(values[:int(valuesWithNulls)], int(nullCount), validBits, offset) 278 case *Int64ColumnChunkReader: 279 values := arrow.Int64Traits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 280 _, err = cr.curDecoder.(encoding.Int64Decoder).DecodeSpaced(values[:int(valuesWithNulls)], int(nullCount), validBits, offset) 281 case *Int96ColumnChunkReader: 282 values := parquet.Int96Traits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 283 _, err = cr.curDecoder.(encoding.Int96Decoder).DecodeSpaced(values[:int(valuesWithNulls)], int(nullCount), validBits, offset) 284 case *ByteArrayColumnChunkReader: 285 values := parquet.ByteArrayTraits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 286 _, err = cr.curDecoder.(encoding.ByteArrayDecoder).DecodeSpaced(values[:int(valuesWithNulls)], int(nullCount), validBits, offset) 287 case *FixedLenByteArrayColumnChunkReader: 288 values := parquet.FixedLenByteArrayTraits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 289 _, err = cr.curDecoder.(encoding.FixedLenByteArrayDecoder).DecodeSpaced(values[:int(valuesWithNulls)], int(nullCount), validBits, offset) 290 case *Float32ColumnChunkReader: 291 values := arrow.Float32Traits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 292 _, err = cr.curDecoder.(encoding.Float32Decoder).DecodeSpaced(values[:int(valuesWithNulls)], int(nullCount), validBits, offset) 293 case *Float64ColumnChunkReader: 294 values := arrow.Float64Traits.CastFromBytes(pr.values.Bytes())[int(pr.valuesWritten):] 295 _, err = cr.curDecoder.(encoding.Float64Decoder).DecodeSpaced(values[:int(valuesWithNulls)], int(nullCount), validBits, offset) 296 default: 297 panic("invalid type for record reader") 298 } 299 return 300 } 301 302 type recordReader struct { 303 recordReaderImpl 304 leafInfo LevelInfo 305 306 nullable bool 307 atRecStart bool 308 recordsRead int64 309 310 levelsWritten int64 311 levelsPos int64 312 levelsCap int64 313 314 defLevels *memory.Buffer 315 repLevels *memory.Buffer 316 317 readDict bool 318 refCount int64 319 } 320 321 // binaryRecordReader is the recordReaderImpl for non-primitive data 322 type binaryRecordReader struct { 323 *recordReader 324 } 325 326 func (b *binaryRecordReader) GetBuilderChunks() []arrow.Array { 327 return b.recordReaderImpl.(binaryRecordReaderImpl).GetBuilderChunks() 328 } 329 330 func newRecordReader(descr *schema.Column, info LevelInfo, mem memory.Allocator) RecordReader { 331 if mem == nil { 332 mem = memory.DefaultAllocator 333 } 334 335 pr := createPrimitiveRecordReader(descr, mem) 336 return &recordReader{ 337 refCount: 1, 338 recordReaderImpl: &pr, 339 leafInfo: info, 340 defLevels: memory.NewResizableBuffer(mem), 341 repLevels: memory.NewResizableBuffer(mem), 342 } 343 } 344 345 func (rr *recordReader) Retain() { 346 atomic.AddInt64(&rr.refCount, 1) 347 } 348 349 func (rr *recordReader) Release() { 350 if atomic.AddInt64(&rr.refCount, -1) == 0 { 351 rr.recordReaderImpl.Release() 352 rr.defLevels.Release() 353 rr.repLevels.Release() 354 rr.defLevels, rr.repLevels = nil, nil 355 } 356 } 357 358 func (rr *recordReader) DefLevels() []int16 { 359 return arrow.Int16Traits.CastFromBytes(rr.defLevels.Bytes()) 360 } 361 362 func (rr *recordReader) RepLevels() []int16 { 363 return arrow.Int16Traits.CastFromBytes(rr.repLevels.Bytes()) 364 } 365 366 func (rr *recordReader) HasMore() bool { 367 return rr.pager() != nil 368 } 369 370 func (rr *recordReader) SetPageReader(pr PageReader) { 371 rr.atRecStart = true 372 rr.recordReaderImpl.SetPageReader(pr) 373 } 374 375 func (rr *recordReader) ValuesWritten() int { 376 return int(rr.recordReaderImpl.ValuesWritten()) 377 } 378 379 func (rr *recordReader) LevelsPos() int64 { return rr.levelsPos } 380 381 func updateCapacity(cap, size, extra int64) (int64, error) { 382 if extra < 0 { 383 return 0, xerrors.New("negative size (corrupt file?)") 384 } 385 target, ok := overflow.Add64(size, extra) 386 if !ok { 387 return 0, xerrors.New("allocation size too large (corrupt file?)") 388 } 389 if target >= (1 << 62) { 390 return 0, xerrors.New("allocation size too large (corrupt file?)") 391 } 392 if cap >= target { 393 return cap, nil 394 } 395 return int64(bitutil.NextPowerOf2(int(target))), nil 396 } 397 398 func (rr *recordReader) Reserve(cap int64) error { 399 if err := rr.reserveLevels(cap); err != nil { 400 return err 401 } 402 if err := rr.reserveValues(cap); err != nil { 403 return err 404 } 405 return nil 406 } 407 408 func (rr *recordReader) reserveLevels(extra int64) error { 409 if rr.Descriptor().MaxDefinitionLevel() > 0 { 410 newCap, err := updateCapacity(rr.levelsCap, rr.levelsWritten, extra) 411 if err != nil { 412 return err 413 } 414 415 if newCap > rr.levelsCap { 416 capBytes, ok := overflow.Mul(int(newCap), arrow.Int16SizeBytes) 417 if !ok { 418 return xerrors.Errorf("allocation size too large (corrupt file?)") 419 } 420 rr.defLevels.ResizeNoShrink(capBytes) 421 if rr.Descriptor().MaxRepetitionLevel() > 0 { 422 rr.repLevels.ResizeNoShrink(capBytes) 423 } 424 rr.levelsCap = newCap 425 } 426 } 427 return nil 428 } 429 430 func (rr *recordReader) reserveValues(extra int64) error { 431 return rr.recordReaderImpl.ReserveValues(extra, rr.leafInfo.HasNullableValues()) 432 } 433 434 func (rr *recordReader) resetValues() { 435 rr.recordReaderImpl.ResetValues() 436 } 437 438 func (rr *recordReader) Reset() { 439 rr.resetValues() 440 441 if rr.levelsWritten > 0 { 442 remain := int(rr.levelsWritten - rr.levelsPos) 443 // shift remaining levels to beginning of buffer and trim only the 444 // number decoded remaining 445 defData := rr.DefLevels() 446 447 copy(defData, defData[int(rr.levelsPos):int(rr.levelsWritten)]) 448 rr.defLevels.ResizeNoShrink(remain * int(arrow.Int16SizeBytes)) 449 450 if rr.Descriptor().MaxRepetitionLevel() > 0 { 451 repData := rr.RepLevels() 452 copy(repData, repData[int(rr.levelsPos):int(rr.levelsWritten)]) 453 rr.repLevels.ResizeNoShrink(remain * int(arrow.Int16SizeBytes)) 454 } 455 456 rr.levelsWritten -= rr.levelsPos 457 rr.levelsPos = 0 458 rr.levelsCap = int64(remain) 459 } 460 461 rr.recordsRead = 0 462 } 463 464 // process written rep/def levels to read the end of records 465 // process no more levels than necessary to delimit the indicated 466 // number of logical records. updates internal state of recordreader 467 // returns number of records delimited 468 func (rr *recordReader) delimitRecords(numRecords int64) (recordsRead, valsToRead int64) { 469 var ( 470 curRep int16 471 curDef int16 472 ) 473 474 defLevels := rr.DefLevels()[int(rr.levelsPos):] 475 repLevels := rr.RepLevels()[int(rr.levelsPos):] 476 477 for rr.levelsPos < rr.levelsWritten { 478 curRep, repLevels = repLevels[0], repLevels[1:] 479 if curRep == 0 { 480 // if at record start, we are seeing the start of a record 481 // for the second time, such as after repeated calls to delimitrecords. 482 // in this case we must continue until we find another record start 483 // or exaust the column chunk 484 if !rr.atRecStart { 485 // end of a record, increment count 486 recordsRead++ 487 if recordsRead == numRecords { 488 // found the number of records we wanted, set record start to true and break 489 rr.atRecStart = true 490 break 491 } 492 } 493 } 494 // we have decided to consume the level at this position 495 // advance until we find another boundary 496 rr.atRecStart = false 497 498 curDef, defLevels = defLevels[0], defLevels[1:] 499 if curDef == rr.Descriptor().MaxDefinitionLevel() { 500 valsToRead++ 501 } 502 rr.levelsPos++ 503 } 504 return 505 } 506 507 func (rr *recordReader) ReadRecordData(numRecords int64) (int64, error) { 508 possibleNum := utils.Max(numRecords, rr.levelsWritten-rr.levelsPos) 509 if err := rr.reserveValues(possibleNum); err != nil { 510 return 0, err 511 } 512 513 var ( 514 startPos = rr.levelsPos 515 valuesToRead int64 516 recordsRead int64 517 nullCount int64 518 err error 519 ) 520 521 if rr.Descriptor().MaxRepetitionLevel() > 0 { 522 recordsRead, valuesToRead = rr.delimitRecords(numRecords) 523 } else if rr.Descriptor().MaxDefinitionLevel() > 0 { 524 // no repetition levels, skip delimiting logic. each level 525 // represents null or not null entry 526 recordsRead = utils.Min(rr.levelsWritten-rr.levelsPos, numRecords) 527 // this is advanced by delimitRecords which we skipped 528 rr.levelsPos += recordsRead 529 } else { 530 recordsRead, valuesToRead = numRecords, numRecords 531 } 532 533 if rr.leafInfo.HasNullableValues() { 534 validityIO := ValidityBitmapInputOutput{ 535 ReadUpperBound: rr.levelsPos - startPos, 536 ValidBits: rr.GetValidBits(), 537 ValidBitsOffset: rr.recordReaderImpl.ValuesWritten(), 538 } 539 DefLevelsToBitmap(rr.DefLevels()[startPos:int(rr.levelsPos)], rr.leafInfo, &validityIO) 540 valuesToRead = validityIO.Read - validityIO.NullCount 541 nullCount = validityIO.NullCount 542 err = rr.ReadValuesSpaced(validityIO.Read, nullCount) 543 } else { 544 err = rr.ReadValuesDense(valuesToRead) 545 } 546 if err != nil { 547 return 0, err 548 } 549 550 if rr.leafInfo.DefLevel > 0 { 551 rr.consumeBufferedValues(rr.levelsPos - startPos) 552 } else { 553 rr.consumeBufferedValues(valuesToRead) 554 } 555 556 // total values, including nullspaces if any 557 rr.IncrementWritten(valuesToRead+nullCount, nullCount) 558 return recordsRead, nil 559 } 560 561 const minLevelBatchSize = 1024 562 563 func (rr *recordReader) ReadRecords(numRecords int64) (int64, error) { 564 // delimit records, then read values at the end 565 recordsRead := int64(0) 566 567 if rr.levelsPos < rr.levelsWritten { 568 additional, err := rr.ReadRecordData(numRecords) 569 if err != nil { 570 return 0, err 571 } 572 recordsRead += additional 573 } 574 575 levelBatch := utils.Max(minLevelBatchSize, numRecords) 576 577 // if we are in the middle of a record, continue until reaching 578 // the desired number of records or the end of the current record 579 // if we have enough 580 for !rr.atRecStart || recordsRead < numRecords { 581 // is there more data in this row group? 582 if !rr.HasNext() { 583 if !rr.atRecStart { 584 // ended the row group while inside a record we haven't seen 585 // the end of yet. increment the record count for the last record 586 // in the row group 587 recordsRead++ 588 rr.atRecStart = true 589 } 590 break 591 } 592 593 // we perform multiple batch reads until we either exhaust the row group 594 // or observe the desired number of records 595 batchSize := utils.Min(levelBatch, rr.numAvailValues()) 596 if batchSize == 0 { 597 // no more data in column 598 break 599 } 600 601 if rr.Descriptor().MaxDefinitionLevel() > 0 { 602 if err := rr.reserveLevels(batchSize); err != nil { 603 return 0, err 604 } 605 606 defLevels := rr.DefLevels()[int(rr.levelsWritten):] 607 608 levelsRead := 0 609 // not present for non-repeated fields 610 if rr.Descriptor().MaxRepetitionLevel() > 0 { 611 repLevels := rr.RepLevels()[int(rr.levelsWritten):] 612 levelsRead, _ = rr.readDefinitionLevels(defLevels[:batchSize]) 613 if rr.readRepetitionLevels(repLevels[:batchSize]) != levelsRead { 614 return 0, xerrors.New("number of decoded rep/def levels did not match") 615 } 616 } else if rr.Descriptor().MaxDefinitionLevel() > 0 { 617 levelsRead, _ = rr.readDefinitionLevels(defLevels[:batchSize]) 618 } 619 620 if levelsRead == 0 { 621 // exhausted column chunk 622 break 623 } 624 625 rr.levelsWritten += int64(levelsRead) 626 read, err := rr.ReadRecordData(numRecords - recordsRead) 627 if err != nil { 628 return recordsRead, err 629 } 630 recordsRead += read 631 } else { 632 // no rep or def levels 633 batchSize = utils.Min(numRecords-recordsRead, batchSize) 634 read, err := rr.ReadRecordData(batchSize) 635 if err != nil { 636 return recordsRead, err 637 } 638 recordsRead += read 639 } 640 } 641 642 return recordsRead, nil 643 } 644 645 func (rr *recordReader) ReleaseValidBits() *memory.Buffer { 646 if rr.leafInfo.HasNullableValues() { 647 return rr.recordReaderImpl.ReleaseValidBits() 648 } 649 return nil 650 } 651 652 // flbaRecordReader is the specialization for optimizing reading fixed-length 653 // byte array records. 654 type flbaRecordReader struct { 655 primitiveRecordReader 656 657 bldr *array.FixedSizeBinaryBuilder 658 valueBuf []parquet.FixedLenByteArray 659 } 660 661 func (fr *flbaRecordReader) ReserveValues(extra int64, hasNullable bool) error { 662 fr.bldr.Reserve(int(extra)) 663 return fr.primitiveRecordReader.ReserveValues(extra, hasNullable) 664 } 665 666 func (fr *flbaRecordReader) Retain() { 667 fr.bldr.Retain() 668 fr.primitiveRecordReader.Retain() 669 } 670 671 func (fr *flbaRecordReader) Release() { 672 fr.bldr.Release() 673 fr.primitiveRecordReader.Release() 674 } 675 676 func (fr *flbaRecordReader) ReadValuesDense(toRead int64) error { 677 if int64(cap(fr.valueBuf)) < toRead { 678 fr.valueBuf = make([]parquet.FixedLenByteArray, 0, toRead) 679 } 680 681 values := fr.valueBuf[:toRead] 682 dec := fr.ColumnChunkReader.(*FixedLenByteArrayColumnChunkReader).curDecoder.(encoding.FixedLenByteArrayDecoder) 683 684 _, err := dec.Decode(values) 685 if err != nil { 686 return err 687 } 688 689 for _, val := range values { 690 fr.bldr.Append(val) 691 } 692 fr.ResetValues() 693 return nil 694 } 695 696 func (fr *flbaRecordReader) ReadValuesSpaced(valuesWithNulls, nullCount int64) error { 697 validBits := fr.validBits.Bytes() 698 offset := fr.valuesWritten 699 700 if int64(cap(fr.valueBuf)) < valuesWithNulls { 701 fr.valueBuf = make([]parquet.FixedLenByteArray, 0, valuesWithNulls) 702 } 703 704 values := fr.valueBuf[:valuesWithNulls] 705 dec := fr.ColumnChunkReader.(*FixedLenByteArrayColumnChunkReader).curDecoder.(encoding.FixedLenByteArrayDecoder) 706 _, err := dec.DecodeSpaced(values, int(nullCount), validBits, offset) 707 if err != nil { 708 return err 709 } 710 711 for idx, val := range values { 712 if bitutil.BitIsSet(validBits, int(offset)+idx) { 713 fr.bldr.Append(val) 714 } else { 715 fr.bldr.AppendNull() 716 } 717 } 718 fr.ResetValues() 719 return nil 720 } 721 722 func (fr *flbaRecordReader) GetBuilderChunks() []arrow.Array { 723 return []arrow.Array{fr.bldr.NewArray()} 724 } 725 726 func newFLBARecordReader(descr *schema.Column, info LevelInfo, mem memory.Allocator) RecordReader { 727 if mem == nil { 728 mem = memory.DefaultAllocator 729 } 730 731 byteWidth := descr.TypeLength() 732 733 return &binaryRecordReader{&recordReader{ 734 recordReaderImpl: &flbaRecordReader{ 735 createPrimitiveRecordReader(descr, mem), 736 array.NewFixedSizeBinaryBuilder(mem, &arrow.FixedSizeBinaryType{ByteWidth: byteWidth}), 737 nil, 738 }, 739 leafInfo: info, 740 defLevels: memory.NewResizableBuffer(mem), 741 repLevels: memory.NewResizableBuffer(mem), 742 refCount: 1, 743 }} 744 } 745 746 // byteArrayRecordReader is the specialization impl for byte-array columns 747 type byteArrayRecordReader struct { 748 primitiveRecordReader 749 750 bldr *array.BinaryBuilder 751 valueBuf []parquet.ByteArray 752 } 753 754 func newByteArrayRecordReader(descr *schema.Column, info LevelInfo, mem memory.Allocator) RecordReader { 755 if mem == nil { 756 mem = memory.DefaultAllocator 757 } 758 759 dt := arrow.BinaryTypes.Binary 760 if descr.LogicalType().Equals(schema.StringLogicalType{}) { 761 dt = arrow.BinaryTypes.String 762 } 763 764 return &binaryRecordReader{&recordReader{ 765 recordReaderImpl: &byteArrayRecordReader{ 766 createPrimitiveRecordReader(descr, mem), 767 array.NewBinaryBuilder(mem, dt), 768 nil, 769 }, 770 leafInfo: info, 771 defLevels: memory.NewResizableBuffer(mem), 772 repLevels: memory.NewResizableBuffer(mem), 773 refCount: 1, 774 }} 775 } 776 777 func (fr *byteArrayRecordReader) ReserveValues(extra int64, hasNullable bool) error { 778 fr.bldr.Reserve(int(extra)) 779 return fr.primitiveRecordReader.ReserveValues(extra, hasNullable) 780 } 781 782 func (fr *byteArrayRecordReader) Retain() { 783 fr.bldr.Retain() 784 fr.primitiveRecordReader.Retain() 785 } 786 787 func (fr *byteArrayRecordReader) Release() { 788 fr.bldr.Release() 789 fr.primitiveRecordReader.Release() 790 } 791 792 func (br *byteArrayRecordReader) ReadValuesDense(toRead int64) error { 793 if int64(cap(br.valueBuf)) < toRead { 794 br.valueBuf = make([]parquet.ByteArray, 0, toRead) 795 } 796 797 values := br.valueBuf[:toRead] 798 dec := br.ColumnChunkReader.(*ByteArrayColumnChunkReader).curDecoder.(encoding.ByteArrayDecoder) 799 800 _, err := dec.Decode(values) 801 if err != nil { 802 return err 803 } 804 805 for _, val := range values { 806 br.bldr.Append(val) 807 } 808 br.ResetValues() 809 return nil 810 } 811 812 func (br *byteArrayRecordReader) ReadValuesSpaced(valuesWithNulls, nullCount int64) error { 813 validBits := br.validBits.Bytes() 814 offset := br.valuesWritten 815 816 if int64(cap(br.valueBuf)) < valuesWithNulls { 817 br.valueBuf = make([]parquet.ByteArray, 0, valuesWithNulls) 818 } 819 820 values := br.valueBuf[:valuesWithNulls] 821 dec := br.ColumnChunkReader.(*ByteArrayColumnChunkReader).curDecoder.(encoding.ByteArrayDecoder) 822 _, err := dec.DecodeSpaced(values, int(nullCount), validBits, offset) 823 if err != nil { 824 return err 825 } 826 827 for idx, val := range values { 828 if bitutil.BitIsSet(validBits, int(offset)+idx) { 829 br.bldr.Append(val) 830 } else { 831 br.bldr.AppendNull() 832 } 833 } 834 br.ResetValues() 835 return nil 836 } 837 838 func (br *byteArrayRecordReader) GetBuilderChunks() []arrow.Array { 839 return []arrow.Array{br.bldr.NewArray()} 840 } 841 842 // TODO(mtopol): create optimized readers for dictionary types after ARROW-7286 is done 843 844 func NewRecordReader(descr *schema.Column, info LevelInfo, readDict bool, mem memory.Allocator) RecordReader { 845 switch descr.PhysicalType() { 846 case parquet.Types.ByteArray: 847 return newByteArrayRecordReader(descr, info, mem) 848 case parquet.Types.FixedLenByteArray: 849 return newFLBARecordReader(descr, info, mem) 850 default: 851 return newRecordReader(descr, info, mem) 852 } 853 }