github.com/apache/arrow/go/v7@v7.0.1/parquet/pqarrow/column_readers.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pqarrow

import (
	"encoding/binary"
	"reflect"
	"sync/atomic"
	"time"
	"unsafe"

	"github.com/apache/arrow/go/v7/arrow"
	"github.com/apache/arrow/go/v7/arrow/array"
	"github.com/apache/arrow/go/v7/arrow/bitutil"
	"github.com/apache/arrow/go/v7/arrow/decimal128"
	"github.com/apache/arrow/go/v7/arrow/memory"
	"github.com/apache/arrow/go/v7/parquet"
	"github.com/apache/arrow/go/v7/parquet/file"
	"github.com/apache/arrow/go/v7/parquet/internal/utils"
	"github.com/apache/arrow/go/v7/parquet/schema"
	"golang.org/x/xerrors"
)

// column reader for leaf columns (non-nested)
type leafReader struct {
	out       *arrow.Chunked
	rctx      *readerCtx
	field     *arrow.Field
	input     *columnIterator
	descr     *schema.Column
	recordRdr file.RecordReader

	refCount int64
}

func newLeafReader(rctx *readerCtx, field *arrow.Field, input *columnIterator, leafInfo file.LevelInfo) (*ColumnReader, error) {
	ret := &leafReader{
		rctx:      rctx,
		field:     field,
		input:     input,
		descr:     input.Descr(),
		recordRdr: file.NewRecordReader(input.Descr(), leafInfo, field.Type.ID() == arrow.DICTIONARY, rctx.mem),
		refCount:  1,
	}
	err := ret.nextRowGroup()
	return &ColumnReader{ret}, err
}

func (lr *leafReader) Retain() {
	atomic.AddInt64(&lr.refCount, 1)
}

func (lr *leafReader) Release() {
	if atomic.AddInt64(&lr.refCount, -1) == 0 {
		if lr.out != nil {
			lr.out.Release()
			lr.out = nil
		}
		if lr.recordRdr != nil {
			lr.recordRdr.Release()
			lr.recordRdr = nil
		}
	}
}

func (lr *leafReader) GetDefLevels() ([]int16, error) {
	return lr.recordRdr.DefLevels()[:int(lr.recordRdr.LevelsPos())], nil
}

func (lr *leafReader) GetRepLevels() ([]int16, error) {
	return lr.recordRdr.RepLevels()[:int(lr.recordRdr.LevelsPos())], nil
}

func (lr *leafReader) IsOrHasRepeatedChild() bool { return false }

func (lr *leafReader) LoadBatch(nrecords int64) (err error) {
	if lr.out != nil {
		lr.out.Release()
		lr.out = nil
	}
	lr.recordRdr.Reset()

	if err := lr.recordRdr.Reserve(nrecords); err != nil {
		return err
	}
	for nrecords > 0 {
		if !lr.recordRdr.HasMore() {
			break
		}
		numRead, err := lr.recordRdr.ReadRecords(nrecords)
		if err != nil {
			return err
		}
		nrecords -= numRead
		if numRead == 0 {
			if err = lr.nextRowGroup(); err != nil {
				return err
			}
		}
	}
	lr.out, err = transferColumnData(lr.recordRdr, lr.field.Type, lr.descr, lr.rctx.mem)
	return
}
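
// Usage sketch (illustrative, not part of the original file): a typical
// driver loads a batch of records and then materializes it as an Arrow
// chunked array. readColumnBatch is a hypothetical helper shown only to
// make the LoadBatch/BuildArray contract concrete.
func readColumnBatch(cr *ColumnReader, nrecords int64) (*arrow.Chunked, error) {
	if err := cr.LoadBatch(nrecords); err != nil {
		return nil, err
	}
	return cr.BuildArray(nrecords)
}
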
func (lr *leafReader) BuildArray(_ int64) (*arrow.Chunked, error) {
	return lr.out, nil
}

func (lr *leafReader) Field() *arrow.Field { return lr.field }

func (lr *leafReader) nextRowGroup() error {
	pr, err := lr.input.NextChunk()
	if err != nil {
		return err
	}
	lr.recordRdr.SetPageReader(pr)
	return nil
}

// column reader for struct arrays; has readers for each child, which could
// themselves be nested or leaf columns.
type structReader struct {
	rctx             *readerCtx
	filtered         *arrow.Field
	levelInfo        file.LevelInfo
	children         []*ColumnReader
	defRepLevelChild *ColumnReader
	hasRepeatedChild bool

	refCount int64
}

func (sr *structReader) Retain() {
	atomic.AddInt64(&sr.refCount, 1)
}

func (sr *structReader) Release() {
	if atomic.AddInt64(&sr.refCount, -1) == 0 {
		if sr.defRepLevelChild != nil {
			sr.defRepLevelChild.Release()
			sr.defRepLevelChild = nil
		}
		for _, c := range sr.children {
			c.Release()
		}
		sr.children = nil
	}
}

func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo file.LevelInfo, children []*ColumnReader) *ColumnReader {
	// there could be a mix of children: some might be repeated and some
	// might not be. If possible, use one that isn't, since that is
	// guaranteed to have the fewest levels needed to reconstruct a
	// nullable bitmap.
	var result *ColumnReader
	for _, child := range children {
		if !child.IsOrHasRepeatedChild() {
			result = child
		}
	}

	ret := &structReader{
		rctx:      rctx,
		filtered:  filtered,
		levelInfo: levelInfo,
		children:  children,
		refCount:  1,
	}
	if result != nil {
		ret.defRepLevelChild = result
		ret.hasRepeatedChild = false
	} else {
		ret.defRepLevelChild = children[0]
		ret.hasRepeatedChild = true
	}
	ret.defRepLevelChild.Retain()
	return &ColumnReader{ret}
}

func (sr *structReader) IsOrHasRepeatedChild() bool { return sr.hasRepeatedChild }

func (sr *structReader) GetDefLevels() ([]int16, error) {
	if len(sr.children) == 0 {
		return nil, xerrors.New("struct reader has no children")
	}

	// this method should only be called when this struct or one of its
	// parents is optional/repeated or has a repeated child, meaning all
	// children must have rep/def levels associated with them
	return sr.defRepLevelChild.GetDefLevels()
}

func (sr *structReader) GetRepLevels() ([]int16, error) {
	if len(sr.children) == 0 {
		return nil, xerrors.New("struct reader has no children")
	}

	// this method should only be called when this struct or one of its
	// parents is optional/repeated or has a repeated child, meaning all
	// children must have rep/def levels associated with them
	return sr.defRepLevelChild.GetRepLevels()
}

func (sr *structReader) LoadBatch(nrecords int64) error {
	for _, rdr := range sr.children {
		if err := rdr.LoadBatch(nrecords); err != nil {
			return err
		}
	}
	return nil
}

func (sr *structReader) Field() *arrow.Field { return sr.filtered }

func (sr *structReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
	validityIO := file.ValidityBitmapInputOutput{
		ReadUpperBound: lenBound,
		Read:           lenBound,
	}

	var nullBitmap *memory.Buffer

	if sr.hasRepeatedChild {
		nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
		nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
		validityIO.ValidBits = nullBitmap.Bytes()
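		// Annotation (not in the original source): with a repeated
		// descendant there is one def/rep level entry per leaf value, not
		// per struct slot, so the repetition levels fetched below are
		// needed to tell which entries open a new slot when rebuilding
		// this struct's validity bitmap.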
		defLevels, err := sr.GetDefLevels()
		if err != nil {
			return nil, err
		}
		repLevels, err := sr.GetRepLevels()
		if err != nil {
			return nil, err
		}

		if err := file.DefRepLevelsToBitmap(defLevels, repLevels, sr.levelInfo, &validityIO); err != nil {
			return nil, err
		}

	} else if sr.filtered.Nullable {
		nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
		nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
		validityIO.ValidBits = nullBitmap.Bytes()
		defLevels, err := sr.GetDefLevels()
		if err != nil {
			return nil, err
		}

		file.DefLevelsToBitmap(defLevels, sr.levelInfo, &validityIO)
	}

	if nullBitmap != nil {
		nullBitmap.Resize(int(bitutil.BytesForBits(validityIO.Read)))
	}

	childArrData := make([]arrow.ArrayData, 0)
	// gather children arrays and def levels
	for _, child := range sr.children {
		field, err := child.BuildArray(validityIO.Read)
		if err != nil {
			return nil, err
		}
		arrdata, err := chunksToSingle(field)
		if err != nil {
			return nil, err
		}
		childArrData = append(childArrData, arrdata)
	}

	if !sr.filtered.Nullable && !sr.hasRepeatedChild {
		validityIO.Read = int64(childArrData[0].Len())
	}

	buffers := make([]*memory.Buffer, 1)
	if validityIO.NullCount > 0 {
		buffers[0] = nullBitmap
	}

	data := array.NewData(sr.filtered.Type, int(validityIO.Read), buffers, childArrData, int(validityIO.NullCount), 0)
	defer data.Release()
	arr := array.MakeFromData(data)
	defer arr.Release()
	return arrow.NewChunked(sr.filtered.Type, []arrow.Array{arr}), nil
}

// column reader for repeated columns specifically for list arrays
type listReader struct {
	rctx    *readerCtx
	field   *arrow.Field
	info    file.LevelInfo
	itemRdr *ColumnReader

	refCount int64
}

func newListReader(rctx *readerCtx, field *arrow.Field, info file.LevelInfo, childRdr *ColumnReader) *ColumnReader {
	childRdr.Retain()
	return &ColumnReader{&listReader{rctx, field, info, childRdr, 1}}
}

func (lr *listReader) Retain() {
	atomic.AddInt64(&lr.refCount, 1)
}

func (lr *listReader) Release() {
	if atomic.AddInt64(&lr.refCount, -1) == 0 {
		if lr.itemRdr != nil {
			lr.itemRdr.Release()
			lr.itemRdr = nil
		}
	}
}

func (lr *listReader) GetDefLevels() ([]int16, error) {
	return lr.itemRdr.GetDefLevels()
}

func (lr *listReader) GetRepLevels() ([]int16, error) {
	return lr.itemRdr.GetRepLevels()
}

func (lr *listReader) Field() *arrow.Field { return lr.field }

func (lr *listReader) IsOrHasRepeatedChild() bool { return true }

func (lr *listReader) LoadBatch(nrecords int64) error {
	return lr.itemRdr.LoadBatch(nrecords)
}

func (lr *listReader) BuildArray(lenBound int64) (*arrow.Chunked, error) {
	var (
		defLevels      []int16
		repLevels      []int16
		err            error
		validityBuffer *memory.Buffer
	)

	if defLevels, err = lr.itemRdr.GetDefLevels(); err != nil {
		return nil, err
	}
	if repLevels, err = lr.itemRdr.GetRepLevels(); err != nil {
		return nil, err
	}

	validityIO := file.ValidityBitmapInputOutput{ReadUpperBound: lenBound}
	if lr.field.Nullable {
		validityBuffer = memory.NewResizableBuffer(lr.rctx.mem)
		validityBuffer.Resize(int(bitutil.BytesForBits(lenBound)))
		defer validityBuffer.Release()
		validityIO.ValidBits = validityBuffer.Bytes()
	}
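
	// Annotation (not in the original source): a list column with N entries
	// needs N+1 int32 offsets, hence the +1 in the allocation below;
	// offsets[i] and offsets[i+1] bound the values of list i in the child
	// array.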
	offsetsBuffer := memory.NewResizableBuffer(lr.rctx.mem)
	offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(lenBound) + 1))
	defer offsetsBuffer.Release()

	offsetData := arrow.Int32Traits.CastFromBytes(offsetsBuffer.Bytes())
	if err = file.DefRepLevelsToListInfo(defLevels, repLevels, lr.info, &validityIO, offsetData); err != nil {
		return nil, err
	}

	arr, err := lr.itemRdr.BuildArray(int64(offsetData[int(validityIO.Read)]))
	if err != nil {
		return nil, err
	}

	// resize to actual number of elems returned
	offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(validityIO.Read) + 1))
	if validityBuffer != nil {
		validityBuffer.Resize(int(bitutil.BytesForBits(validityIO.Read)))
	}

	item, err := chunksToSingle(arr)
	if err != nil {
		return nil, err
	}
	defer item.Release()

	buffers := []*memory.Buffer{nil, offsetsBuffer}
	if validityIO.NullCount > 0 {
		buffers[0] = validityBuffer
	}

	data := array.NewData(lr.field.Type, int(validityIO.Read), buffers, []arrow.ArrayData{item}, int(validityIO.NullCount), 0)
	defer data.Release()
	if lr.field.Type.ID() == arrow.FIXED_SIZE_LIST {
		defer data.Buffers()[1].Release()
		listSize := lr.field.Type.(*arrow.FixedSizeListType).Len()
		for x := 1; x < data.Len(); x++ {
			size := offsetData[x] - offsetData[x-1]
			if size != listSize {
				return nil, xerrors.Errorf("expected all lists to be of size=%d, but index %d had size=%d", listSize, x, size)
			}
		}
		data.Buffers()[1] = nil
	}
	out := array.MakeFromData(data)
	defer out.Release()
	return arrow.NewChunked(lr.field.Type, []arrow.Array{out}), nil
}

// column reader logic for fixed size lists instead of variable length ones.
type fixedSizeListReader struct {
	listReader
}

func newFixedSizeListReader(rctx *readerCtx, field *arrow.Field, info file.LevelInfo, childRdr *ColumnReader) *ColumnReader {
	childRdr.Retain()
	return &ColumnReader{&fixedSizeListReader{listReader{rctx, field, info, childRdr, 1}}}
}
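
// Worked sketch (illustrative, not part of the original file): for a
// list<int32> column holding [[1, 2], [], [3]], DefRepLevelsToListInfo
// yields offsets {0, 2, 2, 3}, and BuildArray above reads exactly
// offsetData[validityIO.Read] == 3 child values. checkOffsetsMonotonic is
// a hypothetical helper making the invariant explicit: offsets never
// decrease.
func checkOffsetsMonotonic(offsets []int32) bool {
	for i := 1; i < len(offsets); i++ {
		if offsets[i] < offsets[i-1] {
			return false
		}
	}
	return true
}
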
// helper function to combine chunks into a single array.
//
// nested data conversion for chunked array outputs not yet implemented
func chunksToSingle(chunked *arrow.Chunked) (arrow.ArrayData, error) {
	switch len(chunked.Chunks()) {
	case 0:
		return array.NewData(chunked.DataType(), 0, []*memory.Buffer{nil, nil}, nil, 0, 0), nil
	case 1:
		return chunked.Chunk(0).Data(), nil
	default: // if an item reader yields a chunked array, this is not yet implemented
		return nil, xerrors.New("not implemented")
	}
}

// create a chunked arrow array from the raw record data
func transferColumnData(rdr file.RecordReader, valueType arrow.DataType, descr *schema.Column, mem memory.Allocator) (*arrow.Chunked, error) {
	var data arrow.ArrayData
	switch valueType.ID() {
	// case arrow.DICTIONARY:
	case arrow.NULL:
		return arrow.NewChunked(arrow.Null, []arrow.Array{array.NewNull(rdr.ValuesWritten())}), nil
	case arrow.INT32, arrow.INT64, arrow.FLOAT32, arrow.FLOAT64:
		data = transferZeroCopy(rdr, valueType) // can just reference the raw data without copying
	case arrow.BOOL:
		data = transferBool(rdr)
	case arrow.UINT8,
		arrow.UINT16,
		arrow.UINT32,
		arrow.UINT64,
		arrow.INT8,
		arrow.INT16,
		arrow.DATE32,
		arrow.TIME32,
		arrow.TIME64:
		data = transferInt(rdr, valueType)
	case arrow.DATE64:
		data = transferDate64(rdr, valueType)
	case arrow.FIXED_SIZE_BINARY, arrow.BINARY, arrow.STRING:
		return transferBinary(rdr, valueType), nil
	case arrow.DECIMAL:
		switch descr.PhysicalType() {
		case parquet.Types.Int32, parquet.Types.Int64:
			data = transferDecimalInteger(rdr, valueType)
		case parquet.Types.ByteArray, parquet.Types.FixedLenByteArray:
			return transferDecimalBytes(rdr.(file.BinaryRecordReader), valueType)
		default:
			return nil, xerrors.New("physical type for decimal128 must be int32, int64, bytearray or fixed len byte array")
		}
	case arrow.TIMESTAMP:
		tstype := valueType.(*arrow.TimestampType)
		switch tstype.Unit {
		case arrow.Millisecond, arrow.Microsecond:
			data = transferZeroCopy(rdr, valueType)
		case arrow.Nanosecond:
			if descr.PhysicalType() == parquet.Types.Int96 {
				data = transferInt96(rdr, valueType)
			} else {
				data = transferZeroCopy(rdr, valueType)
			}
		default:
			return nil, xerrors.New("time unit not supported")
		}
	default:
		return nil, xerrors.Errorf("no support for reading columns of type: %s", valueType.Name())
	}

	defer data.Release()
	arr := array.MakeFromData(data)
	defer arr.Release()
	return arrow.NewChunked(valueType, []arrow.Array{arr}), nil
}

func transferZeroCopy(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	bitmap := rdr.ReleaseValidBits()
	values := rdr.ReleaseValues()
	defer func() {
		if bitmap != nil {
			bitmap.Release()
		}
		if values != nil {
			values.Release()
		}
	}()

	return array.NewData(dt, rdr.ValuesWritten(), []*memory.Buffer{
		bitmap, values}, nil, int(rdr.NullCount()), 0)
}

func transferBinary(rdr file.RecordReader, dt arrow.DataType) *arrow.Chunked {
	brdr := rdr.(file.BinaryRecordReader)
	chunks := brdr.GetBuilderChunks()
	if dt == arrow.BinaryTypes.String {
		// convert chunks from binary to string without copying data,
		// just changing the interpretation of the metadata
		for idx := range chunks {
			chunks[idx] = array.MakeFromData(chunks[idx].Data())
			defer chunks[idx].Data().Release()
			defer chunks[idx].Release()
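			// Annotation (not in the original source): these deferred
			// releases run when transferBinary returns; by then the
			// chunked array built below holds its own retained
			// references, so dropping the local ones here is safe.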
		}
	}
	return arrow.NewChunked(dt, chunks)
}

func transferInt(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	var (
		output reflect.Value
	)

	signed := true
	// create buffer for proper type since parquet only has int32 and int64
	// physical representations, but we want the correct type representation
	// for Arrow's in memory buffer.
	data := make([]byte, rdr.ValuesWritten()*int(bitutil.BytesForBits(int64(dt.(arrow.FixedWidthDataType).BitWidth()))))
	switch dt.ID() {
	case arrow.INT8:
		output = reflect.ValueOf(arrow.Int8Traits.CastFromBytes(data))
	case arrow.UINT8:
		signed = false
		output = reflect.ValueOf(arrow.Uint8Traits.CastFromBytes(data))
	case arrow.INT16:
		output = reflect.ValueOf(arrow.Int16Traits.CastFromBytes(data))
	case arrow.UINT16:
		signed = false
		output = reflect.ValueOf(arrow.Uint16Traits.CastFromBytes(data))
	case arrow.UINT32:
		signed = false
		output = reflect.ValueOf(arrow.Uint32Traits.CastFromBytes(data))
	case arrow.UINT64:
		signed = false
		output = reflect.ValueOf(arrow.Uint64Traits.CastFromBytes(data))
	case arrow.DATE32:
		output = reflect.ValueOf(arrow.Date32Traits.CastFromBytes(data))
	case arrow.TIME32:
		output = reflect.ValueOf(arrow.Time32Traits.CastFromBytes(data))
	case arrow.TIME64:
		output = reflect.ValueOf(arrow.Time64Traits.CastFromBytes(data))
	}

	length := rdr.ValuesWritten()
	// copy the values semantically with the correct types
	switch rdr.Type() {
	case parquet.Types.Int32:
		values := arrow.Int32Traits.CastFromBytes(rdr.Values())
		if signed {
			for idx, v := range values[:length] {
				output.Index(idx).SetInt(int64(v))
			}
		} else {
			for idx, v := range values[:length] {
				output.Index(idx).SetUint(uint64(v))
			}
		}
	case parquet.Types.Int64:
		values := arrow.Int64Traits.CastFromBytes(rdr.Values())
		if signed {
			for idx, v := range values[:length] {
				output.Index(idx).SetInt(v)
			}
		} else {
			for idx, v := range values[:length] {
				output.Index(idx).SetUint(uint64(v))
			}
		}
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}

	return array.NewData(dt, rdr.ValuesWritten(), []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}

func transferBool(rdr file.RecordReader) arrow.ArrayData {
	// TODO(mtopol): optimize this so we don't convert bitmap to []bool back to bitmap
	length := rdr.ValuesWritten()
	data := make([]byte, int(bitutil.BytesForBits(int64(length))))
	bytedata := rdr.Values()
	values := *(*[]bool)(unsafe.Pointer(&bytedata))

	for idx, v := range values[:length] {
		if v {
			bitutil.SetBit(data, idx)
		}
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}
	return array.NewData(&arrow.BooleanType{}, length, []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}

var milliPerDay = time.Duration(24 * time.Hour).Milliseconds()
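
// Worked sketch (illustrative, not part of the original file): date32 value
// 19358 is 2023-01-01 as days since the Unix epoch; transferDate64 below
// widens it to the date64 value 19358 * 86_400_000 = 1_672_531_200_000 ms.
// date32ToDate64 is a hypothetical one-value version of that loop body.
func date32ToDate64(days int32) int64 {
	return int64(days) * milliPerDay
}
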
// parquet equivalent for date64 is a 32-bit integer of the number of days
// since the epoch. Convert each value to milliseconds for date64.
func transferDate64(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	length := rdr.ValuesWritten()
	values := arrow.Int32Traits.CastFromBytes(rdr.Values())

	data := make([]byte, arrow.Int64Traits.BytesRequired(length))
	out := arrow.Int64Traits.CastFromBytes(data)
	for idx, val := range values[:length] {
		out[idx] = int64(val) * milliPerDay
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}
	return array.NewData(dt, length, []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}

// coerce int96 to nanosecond timestamp
func transferInt96(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	length := rdr.ValuesWritten()
	values := parquet.Int96Traits.CastFromBytes(rdr.Values())

	data := make([]byte, arrow.Int64SizeBytes*length)
	out := arrow.Int64Traits.CastFromBytes(data)

	for idx, val := range values[:length] {
		if binary.LittleEndian.Uint32(val[8:]) == 0 {
			out[idx] = 0
		} else {
			out[idx] = val.ToTime().UnixNano()
		}
	}

	bitmap := rdr.ReleaseValidBits()
	if bitmap != nil {
		defer bitmap.Release()
	}
	return array.NewData(dt, length, []*memory.Buffer{
		bitmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}

// convert physical integer storage of a decimal logical type to a decimal128 typed array
func transferDecimalInteger(rdr file.RecordReader, dt arrow.DataType) arrow.ArrayData {
	length := rdr.ValuesWritten()

	var values reflect.Value
	switch rdr.Type() {
	case parquet.Types.Int32:
		values = reflect.ValueOf(arrow.Int32Traits.CastFromBytes(rdr.Values())[:length])
	case parquet.Types.Int64:
		values = reflect.ValueOf(arrow.Int64Traits.CastFromBytes(rdr.Values())[:length])
	}

	data := make([]byte, arrow.Decimal128Traits.BytesRequired(length))
	out := arrow.Decimal128Traits.CastFromBytes(data)
	for i := 0; i < values.Len(); i++ {
		out[i] = decimal128.FromI64(values.Index(i).Int())
	}

	var nullmap *memory.Buffer
	if rdr.NullCount() > 0 {
		nullmap = rdr.ReleaseValidBits()
		defer nullmap.Release()
	}
	return array.NewData(dt, length, []*memory.Buffer{
		nullmap, memory.NewBufferBytes(data),
	}, nil, int(rdr.NullCount()), 0)
}

func uint64FromBigEndianShifted(buf []byte) uint64 {
	var (
		bytes [8]byte
	)
	copy(bytes[8-len(buf):], buf)
	return binary.BigEndian.Uint64(bytes[:])
}
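
// Worked sketch (illustrative, not part of the original file):
// uint64FromBigEndianShifted right-aligns a short big-endian slice into a
// uint64. For example, {0x01, 0x02} -> 0x0000000000000102, and a full
// 8-byte slice is decoded unchanged.
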
// parquet's defined encoding for decimal data is for it to be written as big
// endian bytes, so convert a big endian byte order to a decimal128
func bigEndianToDecimal128(buf []byte) (decimal128.Num, error) {
	const (
		minDecimalBytes = 1
		maxDecimalBytes = 16
	)

	if len(buf) < minDecimalBytes || len(buf) > maxDecimalBytes {
		return decimal128.Num{}, xerrors.Errorf("length of byte array passed to bigEndianToDecimal128 was %d but must be between %d and %d",
			len(buf), minDecimalBytes, maxDecimalBytes)
	}

	// bytes are big endian so first byte is MSB and holds the sign bit
	isNeg := int8(buf[0]) < 0

	// 1. extract high bits
	highBitsOffset := utils.MaxInt(0, len(buf)-8)
	var (
		highBits uint64
		lowBits  uint64
		hi       int64
		lo       int64
	)
	highBits = uint64FromBigEndianShifted(buf[:highBitsOffset])

	if highBitsOffset == 8 {
		hi = int64(highBits)
	} else {
		if isNeg && len(buf) < maxDecimalBytes {
			hi = -1
		}

		hi = int64(uint64(hi) << (uint64(highBitsOffset) * 8))
		hi |= int64(highBits)
	}

	// 2. extract lower bits
	lowBitsOffset := utils.MinInt(len(buf), 8)
	lowBits = uint64FromBigEndianShifted(buf[highBitsOffset:])

	if lowBitsOffset == 8 {
		lo = int64(lowBits)
	} else {
		if isNeg && len(buf) < 8 {
			lo = -1
		}

		lo = int64(uint64(lo) << (uint64(lowBitsOffset) * 8))
		lo |= int64(lowBits)
	}

	return decimal128.New(hi, uint64(lo)), nil
}

type varOrFixedBin interface {
	arrow.Array
	Value(i int) []byte
}

// convert physical byte storage, instead of integers, to decimal128
func transferDecimalBytes(rdr file.BinaryRecordReader, dt arrow.DataType) (*arrow.Chunked, error) {
	convert := func(arr arrow.Array) (arrow.Array, error) {
		length := arr.Len()
		data := make([]byte, arrow.Decimal128Traits.BytesRequired(length))
		out := arrow.Decimal128Traits.CastFromBytes(data)

		input := arr.(varOrFixedBin)
		nullCount := input.NullN()

		var err error
		for i := 0; i < length; i++ {
			if nullCount > 0 && input.IsNull(i) {
				continue
			}

			rec := input.Value(i)
			if len(rec) <= 0 {
				return nil, xerrors.Errorf("invalid BYTEARRAY length for type: %s", dt)
			}
			out[i], err = bigEndianToDecimal128(rec)
			if err != nil {
				return nil, err
			}
		}

		ret := array.NewData(dt, length, []*memory.Buffer{
			input.Data().Buffers()[0], memory.NewBufferBytes(data),
		}, nil, nullCount, 0)
		defer ret.Release()
		return array.MakeFromData(ret), nil
	}

	chunks := rdr.GetBuilderChunks()
	var err error
	for idx, chunk := range chunks {
		defer chunk.Release()
		if chunks[idx], err = convert(chunk); err != nil {
			return nil, err
		}
		defer chunks[idx].Release()
	}
	return arrow.NewChunked(dt, chunks), nil
}
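
// Worked sketch (illustrative, not part of the original file): demonstrates
// the two's-complement sign extension performed by bigEndianToDecimal128 on
// buffers shorter than 16 bytes. exampleBigEndianToDecimal128 is a
// hypothetical helper, not part of the package's API.
func exampleBigEndianToDecimal128() {
	// {0xFF, 0xFE} is -2 as a 16-bit big-endian two's-complement value, so
	// sign extension must yield hi == -1 and lo == 0xFFFFFFFFFFFFFFFE.
	n, err := bigEndianToDecimal128([]byte{0xFF, 0xFE})
	if err != nil {
		panic(err)
	}
	_ = n // n.HighBits() == -1, n.LowBits() == 0xFFFFFFFFFFFFFFFE
}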