package parquet

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"math/bits"
	"reflect"
	"sort"
	"time"
	"unsafe"

	"github.com/parquet-go/parquet-go/deprecated"
	"github.com/parquet-go/parquet-go/encoding/plain"
	"github.com/parquet-go/parquet-go/internal/bitpack"
	"github.com/parquet-go/parquet-go/internal/unsafecast"
	"github.com/parquet-go/parquet-go/sparse"
)

// ColumnBuffer is an interface representing columns of a row group.
//
// ColumnBuffer implements sort.Interface as a way to support reordering the
// rows that have been written to it.
//
// The current implementation has a limitation which prevents applications from
// providing custom versions of this interface because it contains unexported
// methods. The only way to create ColumnBuffer values is to call the
// NewColumnBuffer of Type instances. This limitation may be lifted in future
// releases.
type ColumnBuffer interface {
	// Exposes a read-only view of the column buffer.
	ColumnChunk

	// The column implements ValueReaderAt as a mechanism to read values at
	// specific locations within the buffer.
	ValueReaderAt

	// The column implements ValueWriter as a mechanism to optimize the copy
	// of values into the buffer in contexts where the row information is
	// provided by the values because the repetition and definition levels
	// are set.
	ValueWriter

	// For indexed columns, returns the underlying dictionary holding the column
	// values. If the column is not indexed, nil is returned.
	Dictionary() Dictionary

	// Returns a copy of the column. The returned copy shares no memory with
	// the original, mutations of either column will not modify the other.
	Clone() ColumnBuffer

	// Returns the column as a Page.
	Page() Page

	// Clears all rows written to the column.
	Reset()

	// Returns the current capacity of the column (rows).
	Cap() int

	// Returns the number of rows currently written to the column.
	Len() int

	// Compares rows at index i and j and reports whether i < j.
	Less(i, j int) bool

	// Swaps rows at index i and j.
	Swap(i, j int)

	// Returns the size of the column buffer in bytes.
	Size() int64

	// This method is employed to write rows from arrays of Go values into the
	// column buffer. The method is currently unexported because it uses unsafe
	// APIs which would be difficult for applications to leverage, increasing
	// the risk of introducing bugs in the code. As a consequence, applications
	// cannot use custom implementations of the ColumnBuffer interface since
	// they cannot declare an unexported method that would match this signature.
	// It means that in order to create a ColumnBuffer value, programs need to
	// go through a call to NewColumnBuffer on a Type instance. We make this
	// trade off for now as it is preferable to optimize for safety over
	// extensibility in the public APIs, we might revisit in the future if we
	// learn about valid use cases for custom column buffer types.
	writeValues(rows sparse.Array, levels columnLevels)
}

// columnLevels groups the repetition and definition levels that apply to a
// batch of values passed to ColumnBuffer.writeValues.
type columnLevels struct {
	repetitionDepth byte
	repetitionLevel byte
	definitionLevel byte
}

// columnIndexOfNullable wraps the column index of the base column buffer so
// that null pages and null counts are derived from the definition levels
// tracked by the nullable (optional or repeated) wrapper rather than from the
// base column, which never stores nulls.
func columnIndexOfNullable(base ColumnBuffer, maxDefinitionLevel byte, definitionLevels []byte) (ColumnIndex, error) {
	index, err := base.ColumnIndex()
	if err != nil {
		return nil, err
	}
	return &nullableColumnIndex{
		ColumnIndex:        index,
		maxDefinitionLevel: maxDefinitionLevel,
		definitionLevels:   definitionLevels,
	}, nil
}

// nullableColumnIndex overrides the NullPage and NullCount methods of the
// embedded ColumnIndex using the definition levels of the wrapping buffer.
type nullableColumnIndex struct {
	ColumnIndex
	maxDefinitionLevel byte
	definitionLevels   []byte
}

// NullPage reports whether the page contains only null values.
func (index *nullableColumnIndex) NullPage(i int) bool {
	return index.NullCount(i) == int64(len(index.definitionLevels))
}

// NullCount returns the number of null values, counted as the definition
// levels lower than the maximum.
//
// NOTE(review): the page index i is ignored and the count spans all tracked
// definition levels; this looks like it relies on column buffers producing a
// single page — confirm before using in a multi-page context.
func (index *nullableColumnIndex) NullCount(i int) int64 {
	return int64(countLevelsNotEqual(index.definitionLevels, index.maxDefinitionLevel))
}

// nullOrdering is the signature of functions comparing two rows of a column
// buffer where either row may be null (definition level below the maximum).
type nullOrdering func(column ColumnBuffer, i, j int, maxDefinitionLevel, definitionLevel1, definitionLevel2 byte) bool

// nullsGoFirst orders null values before non-null values; two non-null values
// are compared by delegating to the base column.
func nullsGoFirst(column ColumnBuffer, i, j int, maxDefinitionLevel, definitionLevel1, definitionLevel2 byte) bool {
	if definitionLevel1 != maxDefinitionLevel {
		return definitionLevel2 == maxDefinitionLevel
	} else {
		return definitionLevel2 == maxDefinitionLevel && column.Less(i, j)
	}
}

// nullsGoLast orders null values after non-null values; two non-null values
// are compared by delegating to the base column.
func nullsGoLast(column ColumnBuffer, i, j int, maxDefinitionLevel, definitionLevel1, definitionLevel2 byte) bool {
	return definitionLevel1 == maxDefinitionLevel && (definitionLevel2 != maxDefinitionLevel || column.Less(i, j))
}

// reversedColumnBuffer is an adapter of ColumnBuffer which inverses the order
// in which rows are ordered when the column gets sorted.
//
// This type is used when buffers are constructed with sorting columns ordering
// values in descending order.
139 type reversedColumnBuffer struct{ ColumnBuffer } 140 141 func (col *reversedColumnBuffer) Less(i, j int) bool { return col.ColumnBuffer.Less(j, i) } 142 143 // optionalColumnBuffer is an implementation of the ColumnBuffer interface used 144 // as a wrapper to an underlying ColumnBuffer to manage the creation of 145 // definition levels. 146 // 147 // Null values are not written to the underlying column; instead, the buffer 148 // tracks offsets of row values in the column, null row values are represented 149 // by the value -1 and a definition level less than the max. 150 // 151 // This column buffer type is used for all leaf columns that have a non-zero 152 // max definition level and a zero repetition level, which may be because the 153 // column or one of its parent(s) are marked optional. 154 type optionalColumnBuffer struct { 155 base ColumnBuffer 156 reordered bool 157 maxDefinitionLevel byte 158 rows []int32 159 sortIndex []int32 160 definitionLevels []byte 161 nullOrdering nullOrdering 162 } 163 164 func newOptionalColumnBuffer(base ColumnBuffer, maxDefinitionLevel byte, nullOrdering nullOrdering) *optionalColumnBuffer { 165 n := base.Cap() 166 return &optionalColumnBuffer{ 167 base: base, 168 maxDefinitionLevel: maxDefinitionLevel, 169 rows: make([]int32, 0, n), 170 definitionLevels: make([]byte, 0, n), 171 nullOrdering: nullOrdering, 172 } 173 } 174 175 func (col *optionalColumnBuffer) Clone() ColumnBuffer { 176 return &optionalColumnBuffer{ 177 base: col.base.Clone(), 178 reordered: col.reordered, 179 maxDefinitionLevel: col.maxDefinitionLevel, 180 rows: append([]int32{}, col.rows...), 181 definitionLevels: append([]byte{}, col.definitionLevels...), 182 nullOrdering: col.nullOrdering, 183 } 184 } 185 186 func (col *optionalColumnBuffer) Type() Type { 187 return col.base.Type() 188 } 189 190 func (col *optionalColumnBuffer) NumValues() int64 { 191 return int64(len(col.definitionLevels)) 192 } 193 194 func (col *optionalColumnBuffer) ColumnIndex() 
(ColumnIndex, error) { 195 return columnIndexOfNullable(col.base, col.maxDefinitionLevel, col.definitionLevels) 196 } 197 198 func (col *optionalColumnBuffer) OffsetIndex() (OffsetIndex, error) { 199 return col.base.OffsetIndex() 200 } 201 202 func (col *optionalColumnBuffer) BloomFilter() BloomFilter { 203 return col.base.BloomFilter() 204 } 205 206 func (col *optionalColumnBuffer) Dictionary() Dictionary { 207 return col.base.Dictionary() 208 } 209 210 func (col *optionalColumnBuffer) Column() int { 211 return col.base.Column() 212 } 213 214 func (col *optionalColumnBuffer) Pages() Pages { 215 return onePage(col.Page()) 216 } 217 218 func (col *optionalColumnBuffer) Page() Page { 219 // No need for any cyclic sorting if the rows have not been reordered. 220 // This case is also important because the cyclic sorting modifies the 221 // buffer which makes it unsafe to read the buffer concurrently. 222 if col.reordered { 223 numNulls := countLevelsNotEqual(col.definitionLevels, col.maxDefinitionLevel) 224 numValues := len(col.rows) - numNulls 225 226 if numValues > 0 { 227 if cap(col.sortIndex) < numValues { 228 col.sortIndex = make([]int32, numValues) 229 } 230 sortIndex := col.sortIndex[:numValues] 231 i := 0 232 for _, j := range col.rows { 233 if j >= 0 { 234 sortIndex[j] = int32(i) 235 i++ 236 } 237 } 238 239 // Cyclic sort: O(N) 240 for i := range sortIndex { 241 for j := int(sortIndex[i]); i != j; j = int(sortIndex[i]) { 242 col.base.Swap(i, j) 243 sortIndex[i], sortIndex[j] = sortIndex[j], sortIndex[i] 244 } 245 } 246 } 247 248 i := 0 249 for _, r := range col.rows { 250 if r >= 0 { 251 col.rows[i] = int32(i) 252 i++ 253 } 254 } 255 256 col.reordered = false 257 } 258 259 return newOptionalPage(col.base.Page(), col.maxDefinitionLevel, col.definitionLevels) 260 } 261 262 func (col *optionalColumnBuffer) Reset() { 263 col.base.Reset() 264 col.rows = col.rows[:0] 265 col.definitionLevels = col.definitionLevels[:0] 266 } 267 268 func (col *optionalColumnBuffer) 
Size() int64 { 269 return int64(4*len(col.rows)+4*len(col.sortIndex)+len(col.definitionLevels)) + col.base.Size() 270 } 271 272 func (col *optionalColumnBuffer) Cap() int { return cap(col.rows) } 273 274 func (col *optionalColumnBuffer) Len() int { return len(col.rows) } 275 276 func (col *optionalColumnBuffer) Less(i, j int) bool { 277 return col.nullOrdering( 278 col.base, 279 int(col.rows[i]), 280 int(col.rows[j]), 281 col.maxDefinitionLevel, 282 col.definitionLevels[i], 283 col.definitionLevels[j], 284 ) 285 } 286 287 func (col *optionalColumnBuffer) Swap(i, j int) { 288 // Because the underlying column does not contain null values, we cannot 289 // swap its values at indexes i and j. We swap the row indexes only, then 290 // reorder the underlying buffer using a cyclic sort when the buffer is 291 // materialized into a page view. 292 col.reordered = true 293 col.rows[i], col.rows[j] = col.rows[j], col.rows[i] 294 col.definitionLevels[i], col.definitionLevels[j] = col.definitionLevels[j], col.definitionLevels[i] 295 } 296 297 func (col *optionalColumnBuffer) WriteValues(values []Value) (n int, err error) { 298 rowIndex := int32(col.base.Len()) 299 300 for n < len(values) { 301 // Collect index range of contiguous null values, from i to n. If this 302 // for loop exhausts the values, all remaining if statements and for 303 // loops will be no-ops and the loop will terminate. 304 i := n 305 for n < len(values) && values[n].definitionLevel != col.maxDefinitionLevel { 306 n++ 307 } 308 309 // Write the contiguous null values up until the first non-null value 310 // obtained in the for loop above. 311 for _, v := range values[i:n] { 312 col.rows = append(col.rows, -1) 313 col.definitionLevels = append(col.definitionLevels, v.definitionLevel) 314 } 315 316 // Collect index range of contiguous non-null values, from i to n. 
317 i = n 318 for n < len(values) && values[n].definitionLevel == col.maxDefinitionLevel { 319 n++ 320 } 321 322 // As long as i < n we have non-null values still to write. It is 323 // possible that we just exhausted the input values in which case i == n 324 // and the outer for loop will terminate. 325 if i < n { 326 count, err := col.base.WriteValues(values[i:n]) 327 col.definitionLevels = appendLevel(col.definitionLevels, col.maxDefinitionLevel, count) 328 329 for count > 0 { 330 col.rows = append(col.rows, rowIndex) 331 rowIndex++ 332 count-- 333 } 334 335 if err != nil { 336 return n, err 337 } 338 } 339 } 340 return n, nil 341 } 342 343 func (col *optionalColumnBuffer) writeValues(rows sparse.Array, levels columnLevels) { 344 // The row count is zero when writing an null optional value, in which case 345 // we still need to output a row to the buffer to record the definition 346 // level. 347 if rows.Len() == 0 { 348 col.definitionLevels = append(col.definitionLevels, levels.definitionLevel) 349 col.rows = append(col.rows, -1) 350 return 351 } 352 353 col.definitionLevels = appendLevel(col.definitionLevels, levels.definitionLevel, rows.Len()) 354 355 i := len(col.rows) 356 j := len(col.rows) + rows.Len() 357 358 if j <= cap(col.rows) { 359 col.rows = col.rows[:j] 360 } else { 361 tmp := make([]int32, j, 2*j) 362 copy(tmp, col.rows) 363 col.rows = tmp 364 } 365 366 if levels.definitionLevel != col.maxDefinitionLevel { 367 broadcastValueInt32(col.rows[i:], -1) 368 } else { 369 broadcastRangeInt32(col.rows[i:], int32(col.base.Len())) 370 col.base.writeValues(rows, levels) 371 } 372 } 373 374 func (col *optionalColumnBuffer) ReadValuesAt(values []Value, offset int64) (int, error) { 375 length := int64(len(col.definitionLevels)) 376 if offset < 0 { 377 return 0, errRowIndexOutOfBounds(offset, length) 378 } 379 if offset >= length { 380 return 0, io.EOF 381 } 382 if length -= offset; length < int64(len(values)) { 383 values = values[:length] 384 } 385 386 
numNulls1 := int64(countLevelsNotEqual(col.definitionLevels[:offset], col.maxDefinitionLevel)) 387 numNulls2 := int64(countLevelsNotEqual(col.definitionLevels[offset:offset+length], col.maxDefinitionLevel)) 388 389 if numNulls2 < length { 390 n, err := col.base.ReadValuesAt(values[:length-numNulls2], offset-numNulls1) 391 if err != nil { 392 return n, err 393 } 394 } 395 396 if numNulls2 > 0 { 397 columnIndex := ^int16(col.Column()) 398 i := numNulls2 - 1 399 j := length - 1 400 definitionLevels := col.definitionLevels[offset : offset+length] 401 maxDefinitionLevel := col.maxDefinitionLevel 402 403 for n := len(definitionLevels) - 1; n >= 0 && j > i; n-- { 404 if definitionLevels[n] != maxDefinitionLevel { 405 values[j] = Value{definitionLevel: definitionLevels[n], columnIndex: columnIndex} 406 } else { 407 values[j] = values[i] 408 i-- 409 } 410 j-- 411 } 412 } 413 414 return int(length), nil 415 } 416 417 // repeatedColumnBuffer is an implementation of the ColumnBuffer interface used 418 // as a wrapper to an underlying ColumnBuffer to manage the creation of 419 // repetition levels, definition levels, and map rows to the region of the 420 // underlying buffer that contains their sequence of values. 421 // 422 // Null values are not written to the underlying column; instead, the buffer 423 // tracks offsets of row values in the column, null row values are represented 424 // by the value -1 and a definition level less than the max. 425 // 426 // This column buffer type is used for all leaf columns that have a non-zero 427 // max repetition level, which may be because the column or one of its parent(s) 428 // are marked repeated. 
type repeatedColumnBuffer struct {
	base               ColumnBuffer
	reordered          bool                  // true after Swap; Page must rebuild base in row order
	maxRepetitionLevel byte
	maxDefinitionLevel byte
	rows               []offsetMapping       // one entry per top-level row
	repetitionLevels   []byte                // one level per value
	definitionLevels   []byte                // one level per value
	buffer             []Value               // scratch space reused by Page and WriteValues
	reordering         *repeatedColumnBuffer // lazily-created twin used to rebuild rows in order
	nullOrdering       nullOrdering
}

// The offsetMapping type maps the logical offset of rows within the repetition
// and definition levels, to the base offsets in the underlying column buffers
// where the non-null values have been written.
type offsetMapping struct {
	offset     uint32 // index of the row's first level in repetitionLevels/definitionLevels
	baseOffset uint32 // index of the row's first non-null value in base
}

// newRepeatedColumnBuffer wraps base to track repetition and definition levels
// for a repeated leaf column.
func newRepeatedColumnBuffer(base ColumnBuffer, maxRepetitionLevel, maxDefinitionLevel byte, nullOrdering nullOrdering) *repeatedColumnBuffer {
	n := base.Cap()
	return &repeatedColumnBuffer{
		base:               base,
		maxRepetitionLevel: maxRepetitionLevel,
		maxDefinitionLevel: maxDefinitionLevel,
		rows:               make([]offsetMapping, 0, n/8),
		repetitionLevels:   make([]byte, 0, n),
		definitionLevels:   make([]byte, 0, n),
		nullOrdering:       nullOrdering,
	}
}

// Clone returns a deep copy sharing no memory with the receiver; the scratch
// buffer and reordering twin are intentionally not copied.
func (col *repeatedColumnBuffer) Clone() ColumnBuffer {
	return &repeatedColumnBuffer{
		base:               col.base.Clone(),
		reordered:          col.reordered,
		maxRepetitionLevel: col.maxRepetitionLevel,
		maxDefinitionLevel: col.maxDefinitionLevel,
		rows:               append([]offsetMapping{}, col.rows...),
		repetitionLevels:   append([]byte{}, col.repetitionLevels...),
		definitionLevels:   append([]byte{}, col.definitionLevels...),
		nullOrdering:       col.nullOrdering,
	}
}

// Type returns the parquet type of the underlying column.
func (col *repeatedColumnBuffer) Type() Type {
	return col.base.Type()
}

// NumValues returns the total number of values, including nulls.
func (col *repeatedColumnBuffer) NumValues() int64 {
	return int64(len(col.definitionLevels))
}

// ColumnIndex derives null statistics from the tracked definition levels.
func (col *repeatedColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return columnIndexOfNullable(col.base, col.maxDefinitionLevel, col.definitionLevels)
}

func (col *repeatedColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return col.base.OffsetIndex()
}

func (col *repeatedColumnBuffer) BloomFilter() BloomFilter {
	return col.base.BloomFilter()
}

func (col *repeatedColumnBuffer) Dictionary() Dictionary {
	return col.base.Dictionary()
}

func (col *repeatedColumnBuffer) Column() int {
	return col.base.Column()
}

func (col *repeatedColumnBuffer) Pages() Pages {
	return onePage(col.Page())
}

// Page materializes the buffer as a page. If rows were swapped since the last
// materialization, the base column and level slices are rebuilt in row order
// by copying each row into a clone (the reordering twin) and swapping the
// rebuilt storage back into the receiver.
func (col *repeatedColumnBuffer) Page() Page {
	if col.reordered {
		if col.reordering == nil {
			col.reordering = col.Clone().(*repeatedColumnBuffer)
		}

		column := col.reordering
		column.Reset()
		maxNumValues := 0
		// Clear the scratch buffer on exit so it does not retain references
		// to values copied during the rebuild.
		defer func() {
			clearValues(col.buffer[:maxNumValues])
		}()

		baseOffset := 0

		for _, row := range col.rows {
			rowOffset := int(row.offset)
			rowLength := repeatedRowLength(col.repetitionLevels[rowOffset:])
			numNulls := countLevelsNotEqual(col.definitionLevels[rowOffset:rowOffset+rowLength], col.maxDefinitionLevel)
			numValues := rowLength - numNulls

			if numValues > 0 {
				if numValues > cap(col.buffer) {
					col.buffer = make([]Value, numValues)
				} else {
					col.buffer = col.buffer[:numValues]
				}
				n, err := col.base.ReadValuesAt(col.buffer, int64(row.baseOffset))
				if err != nil && n < numValues {
					return newErrorPage(col.Type(), col.Column(), "reordering rows of repeated column: %w", err)
				}
				if _, err := column.base.WriteValues(col.buffer); err != nil {
					return newErrorPage(col.Type(), col.Column(), "reordering rows of repeated column: %w", err)
				}
				if numValues > maxNumValues {
					maxNumValues = numValues
				}
			}

			column.rows = append(column.rows, offsetMapping{
				offset:     uint32(len(column.repetitionLevels)),
				baseOffset: uint32(baseOffset),
			})

			column.repetitionLevels = append(column.repetitionLevels, col.repetitionLevels[rowOffset:rowOffset+rowLength]...)
			column.definitionLevels = append(column.definitionLevels, col.definitionLevels[rowOffset:rowOffset+rowLength]...)
			baseOffset += numValues
		}

		col.swapReorderingBuffer(column)
		col.reordered = false
	}

	return newRepeatedPage(
		col.base.Page(),
		col.maxRepetitionLevel,
		col.maxDefinitionLevel,
		col.repetitionLevels,
		col.definitionLevels,
	)
}

// swapReorderingBuffer exchanges the storage of col and buf so the rebuilt
// (ordered) storage becomes the receiver's and the old storage is kept for
// reuse on the next reorder.
func (col *repeatedColumnBuffer) swapReorderingBuffer(buf *repeatedColumnBuffer) {
	col.base, buf.base = buf.base, col.base
	col.rows, buf.rows = buf.rows, col.rows
	col.repetitionLevels, buf.repetitionLevels = buf.repetitionLevels, col.repetitionLevels
	col.definitionLevels, buf.definitionLevels = buf.definitionLevels, col.definitionLevels
}

// Reset clears all rows while retaining allocated capacity.
func (col *repeatedColumnBuffer) Reset() {
	col.base.Reset()
	col.rows = col.rows[:0]
	col.repetitionLevels = col.repetitionLevels[:0]
	col.definitionLevels = col.definitionLevels[:0]
}

// Size returns the memory footprint in bytes: 8 bytes per offsetMapping,
// 1 byte per level, plus the base column.
func (col *repeatedColumnBuffer) Size() int64 {
	return int64(8*len(col.rows)+len(col.repetitionLevels)+len(col.definitionLevels)) + col.base.Size()
}

func (col *repeatedColumnBuffer) Cap() int { return cap(col.rows) }

func (col *repeatedColumnBuffer) Len() int { return len(col.rows) }

// Less compares two rows element-wise using the configured null ordering;
// when all compared elements are equal the shorter row sorts first.
//
// NOTE(review): x and y stay at the rows' first base offsets while k advances
// through the definition levels — verify whether elements beyond the first
// non-null value of each row were intended to be compared by base offset.
func (col *repeatedColumnBuffer) Less(i, j int) bool {
	row1 := col.rows[i]
	row2 := col.rows[j]
	less := col.nullOrdering
	row1Length := repeatedRowLength(col.repetitionLevels[row1.offset:])
	row2Length := repeatedRowLength(col.repetitionLevels[row2.offset:])

	for k := 0; k < row1Length && k < row2Length; k++ {
		x := int(row1.baseOffset)
		y := int(row2.baseOffset)
		definitionLevel1 := col.definitionLevels[int(row1.offset)+k]
		definitionLevel2 := col.definitionLevels[int(row2.offset)+k]
		switch {
		case less(col.base, x, y, col.maxDefinitionLevel, definitionLevel1, definitionLevel2):
			return true
		case less(col.base, y, x, col.maxDefinitionLevel, definitionLevel2, definitionLevel1):
			return false
		}
	}

	return row1Length < row2Length
}

func (col *repeatedColumnBuffer) Swap(i, j int) {
	// Because the underlying column does not contain null values, and may hold
	// an arbitrary number of values per row, we cannot swap its values at
	// indexes i and j. We swap the row indexes only, then reorder the base
	// column buffer when its view is materialized into a page by creating a
	// copy and writing rows back to it following the order of rows in the
	// repeated column buffer.
	col.reordered = true
	col.rows[i], col.rows[j] = col.rows[j], col.rows[i]
}

// WriteValues splits the input into rows at the values whose repetition level
// is zero and writes each row through writeRow.
func (col *repeatedColumnBuffer) WriteValues(values []Value) (numValues int, err error) {
	maxRowLen := 0
	// Clear the scratch buffer on exit so it does not retain references to
	// the written values.
	defer func() {
		clearValues(col.buffer[:maxRowLen])
	}()

	for i := 0; i < len(values); {
		j := i

		// A zero repetition level marks the start of a new row; skip past it
		// before scanning for the next row boundary.
		if values[j].repetitionLevel == 0 {
			j++
		}

		for j < len(values) && values[j].repetitionLevel != 0 {
			j++
		}

		if err := col.writeRow(values[i:j]); err != nil {
			return numValues, err
		}

		if len(col.buffer) > maxRowLen {
			maxRowLen = len(col.buffer)
		}

		numValues += j - i
		i = j
	}

	return numValues, nil
}

// writeRow writes one row's values: non-null values go to the base column,
// every value contributes its repetition and definition levels, and a new
// offsetMapping is recorded when the row starts at repetition level zero.
func (col *repeatedColumnBuffer) writeRow(row []Value) error {
	col.buffer = col.buffer[:0]

	for _, v := range row {
		if v.definitionLevel == col.maxDefinitionLevel {
			col.buffer = append(col.buffer, v)
		}
	}

	baseOffset := col.base.NumValues()
	if len(col.buffer) > 0 {
		if _, err := col.base.WriteValues(col.buffer); err != nil {
			return err
		}
	}

	if row[0].repetitionLevel == 0 {
		col.rows = append(col.rows, offsetMapping{
			offset:     uint32(len(col.repetitionLevels)),
			baseOffset: uint32(baseOffset),
		})
	}

	for _, v := range row {
		col.repetitionLevels = append(col.repetitionLevels, v.repetitionLevel)
		col.definitionLevels = append(col.definitionLevels, v.definitionLevel)
	}

	return nil
}

func (col *repeatedColumnBuffer) writeValues(row sparse.Array, levels columnLevels) {
	// A zero repetition level marks the start of a new row.
	if levels.repetitionLevel == 0 {
		col.rows = append(col.rows, offsetMapping{
			offset:     uint32(len(col.repetitionLevels)),
			baseOffset: uint32(col.base.NumValues()),
		})
	}

	// An empty array still records one level pair so the (possibly null)
	// element is represented in the page.
	if row.Len() == 0 {
		col.repetitionLevels = append(col.repetitionLevels, levels.repetitionLevel)
		col.definitionLevels = append(col.definitionLevels, levels.definitionLevel)
		return
	}

	col.repetitionLevels = appendLevel(col.repetitionLevels, levels.repetitionLevel, row.Len())
	col.definitionLevels = appendLevel(col.definitionLevels, levels.definitionLevel, row.Len())

	// Only non-null values (max definition level) are stored in the base.
	if levels.definitionLevel == col.maxDefinitionLevel {
		col.base.writeValues(row, levels)
	}
}

func (col *repeatedColumnBuffer) ReadValuesAt(values []Value, offset int64) (int, error) {
	// TODO:
	panic("NOT IMPLEMENTED")
}

// repeatedRowLength gives the length of the repeated row starting at the
// beginning of the repetitionLevels slice.
func repeatedRowLength(repetitionLevels []byte) int {
	// If a repetition level exists, at least one value is required to represent
	// the column.
	if len(repetitionLevels) > 0 {
		// The subsequent levels will represent the start of a new record when
		// they go back to zero.
		if i := bytes.IndexByte(repetitionLevels[1:], 0); i >= 0 {
			return i + 1
		}
	}
	return len(repetitionLevels)
}

// =============================================================================
// The types below are in-memory implementations of the ColumnBuffer interface
// for each parquet type.
//
// These column buffers are created by calling NewColumnBuffer on parquet.Type
// instances; each parquet type manages to construct column buffers of the
// appropriate type, which ensures that we are packing as many values as we
// can in memory.
//
// See Type.NewColumnBuffer for details about how these types get created.
// =============================================================================

type booleanColumnBuffer struct{ booleanPage }

// newBooleanColumnBuffer creates a bit-packed buffer sized for numValues
// booleans.
func newBooleanColumnBuffer(typ Type, columnIndex int16, numValues int32) *booleanColumnBuffer {
	// Boolean values are bit-packed, we can fit up to 8 values per byte.
	bufferSize := (numValues + 7) / 8
	return &booleanColumnBuffer{
		booleanPage: booleanPage{
			typ:         typ,
			bits:        make([]byte, 0, bufferSize),
			columnIndex: ^columnIndex,
		},
	}
}

// Clone returns a deep copy sharing no memory with the receiver.
func (col *booleanColumnBuffer) Clone() ColumnBuffer {
	return &booleanColumnBuffer{
		booleanPage: booleanPage{
			typ:         col.typ,
			bits:        append([]byte{}, col.bits...),
			offset:      col.offset,
			numValues:   col.numValues,
			columnIndex: col.columnIndex,
		},
	}
}

func (col *booleanColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return booleanColumnIndex{&col.booleanPage}, nil
}

func (col *booleanColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return booleanOffsetIndex{&col.booleanPage}, nil
}

func (col *booleanColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *booleanColumnBuffer) Dictionary() Dictionary { return nil }

func (col *booleanColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *booleanColumnBuffer) Page() Page { return &col.booleanPage }

// Reset clears all values while retaining the allocated bit buffer.
func (col *booleanColumnBuffer) Reset() {
	col.bits = col.bits[:0]
	col.offset = 0
	col.numValues = 0
}

// Cap reports capacity in values: 8 bit-packed booleans per byte.
func (col *booleanColumnBuffer) Cap() int { return 8 * cap(col.bits) }

func (col *booleanColumnBuffer) Len() int { return int(col.numValues) }

// Less orders false before true.
func (col *booleanColumnBuffer) Less(i, j int) bool {
	a := col.valueAt(i)
	b := col.valueAt(j)
	return a != b && !a
}

// valueAt extracts the i-th bit-packed boolean.
func (col *booleanColumnBuffer) valueAt(i int) bool {
	j := uint32(i) / 8
	k := uint32(i) % 8
	return ((col.bits[j] >> k) & 1) != 0
}

// setValueAt stores v as the i-th bit-packed boolean.
func (col *booleanColumnBuffer) setValueAt(i int, v bool) {
	// `offset` is always zero in the page of a column buffer
	j := uint32(i) / 8
	k := uint32(i) % 8
	x := byte(0)
	if v {
		x = 1
	}
	col.bits[j] = (col.bits[j] & ^(1 << k)) | (x << k)
}

func (col *booleanColumnBuffer) Swap(i, j int) {
	a := col.valueAt(i)
	b := col.valueAt(j)
	col.setValueAt(i, b)
	col.setValueAt(j, a)
}

// WriteBooleans appends plain Go booleans to the buffer.
func (col *booleanColumnBuffer) WriteBooleans(values []bool) (int, error) {
	col.writeValues(sparse.MakeBoolArray(values).UnsafeArray(), columnLevels{})
	return len(values), nil
}

// WriteValues appends the boolean payload of each Value to the buffer.
func (col *booleanColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues bit-packs the input booleans into col.bits, using a byte-aligned
// fast path for the bulk of the values.
func (col *booleanColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	numBytes := bitpack.ByteCount(uint(col.numValues) + uint(rows.Len()))
	if cap(col.bits) < numBytes {
		col.bits = append(make([]byte, 0, max(numBytes, 2*cap(col.bits))), col.bits...)
	}
	col.bits = col.bits[:numBytes]
	i := 0
	r := 8 - (int(col.numValues) % 8)
	bytes := rows.Uint8Array()

	if r <= bytes.Len() {
		// First we attempt to write enough bits to align the number of values
		// in the column buffer on 8 bytes. After this step the next bit should
		// be written at the zero'th index of a byte of the buffer.
		if r < 8 {
			var b byte
			for i < r {
				v := bytes.Index(i)
				b |= (v & 1) << uint(i)
				i++
			}
			x := uint(col.numValues) / 8
			y := uint(col.numValues) % 8
			col.bits[x] = (b << y) | (col.bits[x] & ^(0xFF << y))
			col.numValues += int32(i)
		}

		if n := ((bytes.Len() - i) / 8) * 8; n > 0 {
			// At this stage, we know that we have at least 8 bits to write
			// and the bits will be aligned on the address of a byte in the
			// output buffer. We can work on 8 values per loop iteration,
			// packing them into a single byte and writing it to the output
			// buffer. This effectively reduces by 87.5% the number of memory
			// stores that the program needs to perform to generate the values.
			i += sparse.GatherBits(col.bits[col.numValues/8:], bytes.Slice(i, i+n))
			col.numValues += int32(n)
		}
	}

	// Slow path: pack any remaining values one bit at a time.
	for i < bytes.Len() {
		x := uint(col.numValues) / 8
		y := uint(col.numValues) % 8
		b := bytes.Index(i)
		col.bits[x] = ((b & 1) << y) | (col.bits[x] & ^(1 << y))
		col.numValues++
		i++
	}

	col.bits = col.bits[:bitpack.ByteCount(uint(col.numValues))]
}

// ReadValuesAt reads up to len(values) booleans starting at the given offset.
func (col *booleanColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(col.numValues))
	case i >= int(col.numValues):
		return 0, io.EOF
	default:
		for n < len(values) && i < int(col.numValues) {
			values[n] = col.makeValue(col.valueAt(i))
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type int32ColumnBuffer struct{ int32Page }

// newInt32ColumnBuffer creates a buffer sized for numValues int32 values.
func newInt32ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int32ColumnBuffer {
	return &int32ColumnBuffer{
		int32Page: int32Page{
			typ:         typ,
			values:      make([]int32, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

// Clone returns a deep copy sharing no memory with the receiver.
func (col *int32ColumnBuffer) Clone() ColumnBuffer {
	return &int32ColumnBuffer{
		int32Page: int32Page{
			typ:         col.typ,
			values:      append([]int32{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *int32ColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return int32ColumnIndex{&col.int32Page}, nil
}

func (col *int32ColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return int32OffsetIndex{&col.int32Page}, nil
}

func (col *int32ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *int32ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *int32ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *int32ColumnBuffer) Page() Page { return &col.int32Page }

func (col *int32ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *int32ColumnBuffer) Cap() int { return cap(col.values) }

func (col *int32ColumnBuffer) Len() int { return len(col.values) }

func (col *int32ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *int32ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write appends raw little-endian INT32 data; len(b) must be a multiple of 4.
func (col *int32ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 4) != 0 {
		return 0, fmt.Errorf("cannot write INT32 values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToInt32(b)...)
	return len(b), nil
}

// WriteInt32s appends plain Go int32 values to the buffer.
func (col *int32ColumnBuffer) WriteInt32s(values []int32) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

// WriteValues appends the int32 payload of each Value to the buffer.
func (col *int32ColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues gathers int32 values from the sparse array into the buffer,
// growing the backing slice geometrically when needed.
func (col *int32ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]int32, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherInt32(col.values[n:], rows.Int32Array())

}

// ReadValuesAt reads up to len(values) int32 values starting at the given
// offset.
func (col *int32ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type int64ColumnBuffer struct{ int64Page }

// newInt64ColumnBuffer creates a buffer sized for numValues int64 values.
func newInt64ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int64ColumnBuffer {
	return &int64ColumnBuffer{
		int64Page: int64Page{
			typ:         typ,
			values:      make([]int64, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

// Clone returns a deep copy sharing no memory with the receiver.
func (col *int64ColumnBuffer) Clone() ColumnBuffer {
	return &int64ColumnBuffer{
		int64Page: int64Page{
			typ:         col.typ,
			values:      append([]int64{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *int64ColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return int64ColumnIndex{&col.int64Page}, nil
}

func (col *int64ColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return int64OffsetIndex{&col.int64Page}, nil
}

func (col *int64ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *int64ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *int64ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *int64ColumnBuffer) Page() Page { return &col.int64Page }

func (col *int64ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *int64ColumnBuffer) Cap() int { return cap(col.values) }

func (col *int64ColumnBuffer) Len() int { return len(col.values) }

func (col *int64ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *int64ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write appends raw little-endian INT64 data; len(b) must be a multiple of 8.
func (col *int64ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 8) != 0 {
		return 0, fmt.Errorf("cannot write INT64 values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToInt64(b)...)
	return len(b), nil
}

// WriteInt64s appends plain Go int64 values to the buffer.
func (col *int64ColumnBuffer) WriteInt64s(values []int64) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

// WriteValues appends the int64 payload of each Value to the buffer.
func (col *int64ColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues gathers int64 values from the sparse array into the buffer,
// growing the backing slice geometrically when needed.
func (col *int64ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]int64, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherInt64(col.values[n:], rows.Int64Array())
}

// ReadValuesAt reads up to len(values) int64 values starting at the given
// offset.
func (col *int64ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type int96ColumnBuffer struct{ int96Page }

// newInt96ColumnBuffer creates a buffer sized for numValues INT96 values.
func newInt96ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int96ColumnBuffer {
	return &int96ColumnBuffer{
		int96Page: int96Page{
			typ:         typ,
			values:      make([]deprecated.Int96, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

// Clone returns a deep copy sharing no memory with the receiver.
func (col *int96ColumnBuffer) Clone() ColumnBuffer {
	return &int96ColumnBuffer{
		int96Page: int96Page{
			typ:         col.typ,
			values:      append([]deprecated.Int96{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *int96ColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return int96ColumnIndex{&col.int96Page}, nil
}

func (col *int96ColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return int96OffsetIndex{&col.int96Page}, nil
}

func (col *int96ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *int96ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *int96ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *int96ColumnBuffer) Page() Page { return &col.int96Page }

func (col *int96ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *int96ColumnBuffer) Cap() int { return cap(col.values) }

// NOTE(review): the source view is truncated here; the declaration below
// continues past the end of this chunk.
func
(col *int96ColumnBuffer) Len() int { return len(col.values) }

// Less delegates to deprecated.Int96's own ordering.
func (col *int96ColumnBuffer) Less(i, j int) bool { return col.values[i].Less(col.values[j]) }

func (col *int96ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write reinterprets b as []deprecated.Int96; len(b) must be a multiple of 12.
func (col *int96ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 12) != 0 {
		return 0, fmt.Errorf("cannot write INT96 values from input of size %d", len(b))
	}
	col.values = append(col.values, deprecated.BytesToInt96(b)...)
	return len(b), nil
}

func (col *int96ColumnBuffer) WriteInt96s(values []deprecated.Int96) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *int96ColumnBuffer) WriteValues(values []Value) (int, error) {
	for _, v := range values {
		col.values = append(col.values, v.Int96())
	}
	return len(values), nil
}

// writeValues copies element by element; unlike the fixed-width numeric
// buffers there is no sparse gather kernel for the 12-byte INT96 layout.
func (col *int96ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	for i := 0; i < rows.Len(); i++ {
		p := rows.Index(i)
		col.values = append(col.values, *(*deprecated.Int96)(p))
	}
}

// ReadValuesAt copies up to len(values) rows starting at row offset;
// io.EOF when the buffer is exhausted.
func (col *int96ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

// floatColumnBuffer buffers FLOAT (float32) values; same contract as
// int32ColumnBuffer earlier in this file.
type floatColumnBuffer struct{ floatPage }

// newFloatColumnBuffer pre-allocates numValues entries; the column index is
// stored bitwise-NOT'ed, consistent with the other column buffers here.
func newFloatColumnBuffer(typ Type, columnIndex int16, numValues int32) *floatColumnBuffer {
	return &floatColumnBuffer{
		floatPage: floatPage{
			typ:         typ,
			values:      make([]float32, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

// Clone returns a deep copy sharing no memory with the receiver.
func (col *floatColumnBuffer) Clone() ColumnBuffer {
	return &floatColumnBuffer{
		floatPage: floatPage{
			typ:         col.typ,
			values:      append([]float32{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *floatColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return floatColumnIndex{&col.floatPage}, nil
}

func (col *floatColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return floatOffsetIndex{&col.floatPage}, nil
}

func (col *floatColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *floatColumnBuffer) Dictionary() Dictionary { return nil }

func (col *floatColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *floatColumnBuffer) Page() Page { return &col.floatPage }

func (col *floatColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *floatColumnBuffer) Cap() int { return cap(col.values) }

func (col *floatColumnBuffer) Len() int { return len(col.values) }

func (col *floatColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *floatColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write reinterprets b in place as []float32; len(b) must be a multiple of 4.
func (col *floatColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 4) != 0 {
		return 0, fmt.Errorf("cannot write FLOAT values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToFloat32(b)...)
	return len(b), nil
}

func (col *floatColumnBuffer) WriteFloats(values []float32) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *floatColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues grows geometrically (at least 2x) then bulk-gathers the rows.
func (col *floatColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]float32, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherFloat32(col.values[n:], rows.Float32Array())
}

func (col *floatColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

// doubleColumnBuffer buffers DOUBLE (float64) values; same contract as
// int32ColumnBuffer earlier in this file.
type doubleColumnBuffer struct{ doublePage }

func newDoubleColumnBuffer(typ Type, columnIndex int16, numValues int32) *doubleColumnBuffer {
	return &doubleColumnBuffer{
		doublePage: doublePage{
			typ:         typ,
			values:      make([]float64, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *doubleColumnBuffer) Clone() ColumnBuffer {
	return &doubleColumnBuffer{
		doublePage: doublePage{
			typ:         col.typ,
			values:      append([]float64{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *doubleColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return doubleColumnIndex{&col.doublePage}, nil
}

func (col *doubleColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return doubleOffsetIndex{&col.doublePage}, nil
}

func (col *doubleColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *doubleColumnBuffer) Dictionary() Dictionary { return nil }

func (col *doubleColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *doubleColumnBuffer) Page() Page { return &col.doublePage }

func (col *doubleColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *doubleColumnBuffer) Cap() int { return cap(col.values) }

func (col *doubleColumnBuffer) Len() int { return len(col.values) }

func (col *doubleColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *doubleColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write reinterprets b in place as []float64; len(b) must be a multiple of 8.
func (col *doubleColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 8) != 0 {
		return 0, fmt.Errorf("cannot write DOUBLE values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToFloat64(b)...)
	return len(b), nil
}

func (col *doubleColumnBuffer) WriteDoubles(values []float64) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *doubleColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues grows geometrically (at least 2x) then bulk-gathers the rows.
func (col *doubleColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]float64, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherFloat64(col.values[n:], rows.Float64Array())
}

// ReadValuesAt copies up to len(values) rows starting at row offset;
// io.EOF when the buffer is exhausted.
func (col *doubleColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

// byteArrayColumnBuffer buffers variable-length BYTE_ARRAY values. Row i is
// the byte range values[offsets[i] : offsets[i]+lengths[i]] (offsets live in
// the embedded byteArrayPage).
type byteArrayColumnBuffer struct {
	byteArrayPage
	// lengths[i] is the byte length of row i; it is the authoritative row
	// count for the buffer (see Len).
	lengths []uint32
	// scratch is reused by Page to rebuild values in row order after sorting.
	scratch []byte
}

func newByteArrayColumnBuffer(typ Type, columnIndex int16, numValues int32) *byteArrayColumnBuffer {
	return &byteArrayColumnBuffer{
		byteArrayPage: byteArrayPage{
			typ:         typ,
			values:      make([]byte, 0, typ.EstimateSize(int(numValues))),
			offsets:     make([]uint32, 0, numValues+1),
			columnIndex: ^columnIndex,
		},
		lengths: make([]uint32, 0, numValues),
	}
}

// Clone returns a deep copy sharing no memory with the receiver.
func (col *byteArrayColumnBuffer) Clone() ColumnBuffer {
	return &byteArrayColumnBuffer{
		byteArrayPage: byteArrayPage{
			typ:         col.typ,
			values:      col.cloneValues(),
			offsets:     col.cloneOffsets(),
			columnIndex: col.columnIndex,
		},
		lengths: col.cloneLengths(),
	}
}

// cloneLengths returns an independent copy of the lengths slice.
func (col *byteArrayColumnBuffer) cloneLengths() []uint32 {
	lengths := make([]uint32, len(col.lengths))
	copy(lengths, col.lengths)
	return lengths
}

func (col *byteArrayColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return byteArrayColumnIndex{&col.byteArrayPage}, nil
}

func (col *byteArrayColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return byteArrayOffsetIndex{&col.byteArrayPage}, nil
}

func (col *byteArrayColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *byteArrayColumnBuffer) Dictionary() Dictionary { return nil }

func (col *byteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) }

// Page exposes the buffer as a Page. Sorting moves offset/length pairs but
// not the bytes themselves (see Swap), so if the offsets are no longer in
// ascending order the values are first compacted into row order via scratch.
func (col *byteArrayColumnBuffer) Page() Page {
	if len(col.lengths) > 0 && orderOfUint32(col.offsets) < 1 { // unordered?
		if cap(col.scratch) < len(col.values) {
			col.scratch = make([]byte, 0, cap(col.values))
		} else {
			col.scratch = col.scratch[:0]
		}

		for i := range col.lengths {
			n := len(col.scratch)
			col.scratch = append(col.scratch, col.index(i)...)
			col.offsets[i] = uint32(n)
		}

		// Swap the buffers so the old values slice becomes the next scratch.
		col.values, col.scratch = col.scratch, col.values
	}
	// The offsets have the total length as the last item. Since we are about to
	// expose the column buffer's internal state as a Page value we ensure that
	// the last offset is the total length of all values.
	col.offsets = append(col.offsets[:len(col.lengths)], uint32(len(col.values)))
	return &col.byteArrayPage
}

func (col *byteArrayColumnBuffer) Reset() {
	col.values = col.values[:0]
	col.offsets = col.offsets[:0]
	col.lengths = col.lengths[:0]
}

// NumRows/NumValues are based on lengths, the buffer's row count.
func (col *byteArrayColumnBuffer) NumRows() int64 { return int64(col.Len()) }

func (col *byteArrayColumnBuffer) NumValues() int64 { return int64(col.Len()) }

func (col *byteArrayColumnBuffer) Cap() int { return cap(col.lengths) }

func (col *byteArrayColumnBuffer) Len() int { return len(col.lengths) }

func (col *byteArrayColumnBuffer) Less(i, j int) bool {
	return bytes.Compare(col.index(i), col.index(j)) < 0
}

// Swap reorders rows by swapping their offset/length pairs; the bytes in
// values stay in place (Page compacts them later if needed).
func (col *byteArrayColumnBuffer) Swap(i, j int) {
	col.offsets[i], col.offsets[j] = col.offsets[j], col.offsets[i]
	col.lengths[i], col.lengths[j] = col.lengths[j], col.lengths[i]
}

// Write consumes PLAIN-encoded byte arrays and reports the number of bytes
// consumed (io.Writer contract).
func (col *byteArrayColumnBuffer) Write(b []byte) (int, error) {
	_, n, err := col.writeByteArrays(b)
	return n, err
}

// WriteByteArrays consumes PLAIN-encoded byte arrays and reports the number
// of values written.
func (col *byteArrayColumnBuffer) WriteByteArrays(values []byte) (int, error) {
	n, _, err := col.writeByteArrays(values)
	return n, err
}

// writeByteArrays appends every PLAIN-encoded value in values, returning both
// the value count and the number of input bytes consumed (content plus the
// per-value length prefixes).
func (col *byteArrayColumnBuffer) writeByteArrays(values []byte) (count, bytes int, err error) {
	baseCount := len(col.lengths)
	baseBytes := len(col.values) + (plain.ByteArrayLengthSize * len(col.lengths))

	err = plain.RangeByteArray(values, func(value []byte) error {
		col.append(unsafecast.BytesToString(value))
		return nil
	})

	count = len(col.lengths) - baseCount
	bytes = (len(col.values) - baseBytes) + (plain.ByteArrayLengthSize * count)
	return count, bytes, err
}

func (col *byteArrayColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.ptr)), columnLevels{})
	return len(values), nil
}

// writeValues appends each row's string header (pointer+length) content.
func (col *byteArrayColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	for i := 0; i < rows.Len(); i++ {
		p := rows.Index(i)
		col.append(*(*string)(p))
	}
}

func (col *byteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.lengths)))
	case i >= len(col.lengths):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.lengths) {
			values[n] = col.makeValueBytes(col.index(i))
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

// append records one value: its start offset, its length, and its bytes.
func (col *byteArrayColumnBuffer) append(value string) {
	col.offsets = append(col.offsets, uint32(len(col.values)))
	col.lengths = append(col.lengths, uint32(len(value)))
	col.values = append(col.values, value...)
}

// index returns the bytes of row i; the three-index slice caps capacity so
// callers cannot append past the row boundary into neighboring data.
func (col *byteArrayColumnBuffer) index(i int) []byte {
	offset := col.offsets[i]
	length := col.lengths[i]
	end := offset + length
	return col.values[offset:end:end]
}

// fixedLenByteArrayColumnBuffer buffers FIXED_LEN_BYTE_ARRAY values; row i
// occupies data[i*size : (i+1)*size].
type fixedLenByteArrayColumnBuffer struct {
	fixedLenByteArrayPage
	// tmp is a size-byte scratch slice used by Swap.
	tmp []byte
}

func newFixedLenByteArrayColumnBuffer(typ Type, columnIndex int16, numValues int32) *fixedLenByteArrayColumnBuffer {
	size := typ.Length()
	return &fixedLenByteArrayColumnBuffer{
		fixedLenByteArrayPage: fixedLenByteArrayPage{
			typ:         typ,
			size:        size,
			data:        make([]byte, 0, typ.EstimateSize(int(numValues))),
			columnIndex: ^columnIndex,
		},
		tmp: make([]byte, size),
	}
}

func (col *fixedLenByteArrayColumnBuffer) Clone() ColumnBuffer {
	return &fixedLenByteArrayColumnBuffer{
		fixedLenByteArrayPage: fixedLenByteArrayPage{
			typ:         col.typ,
			size:        col.size,
			data:        append([]byte{}, col.data...),
			columnIndex: col.columnIndex,
		},
		tmp: make([]byte, col.size),
	}
}

func (col *fixedLenByteArrayColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return fixedLenByteArrayColumnIndex{&col.fixedLenByteArrayPage}, nil
}

func (col *fixedLenByteArrayColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return fixedLenByteArrayOffsetIndex{&col.fixedLenByteArrayPage}, nil
}

func (col *fixedLenByteArrayColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *fixedLenByteArrayColumnBuffer) Dictionary() Dictionary { return nil }

func (col *fixedLenByteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *fixedLenByteArrayColumnBuffer) Page() Page { return &col.fixedLenByteArrayPage }

func (col *fixedLenByteArrayColumnBuffer) Reset() { col.data = col.data[:0] }

func (col *fixedLenByteArrayColumnBuffer) Cap() int { return cap(col.data) / col.size }
func (col *fixedLenByteArrayColumnBuffer) Len() int { return len(col.data) / col.size }

func (col *fixedLenByteArrayColumnBuffer) Less(i, j int) bool {
	return bytes.Compare(col.index(i), col.index(j)) < 0
}

// Swap exchanges two size-byte rows via a three-way copy through tmp.
func (col *fixedLenByteArrayColumnBuffer) Swap(i, j int) {
	t, u, v := col.tmp[:col.size], col.index(i), col.index(j)
	copy(t, u)
	copy(u, v)
	copy(v, t)
}

// index returns row i; the three-index slice caps capacity at the element
// boundary so appends through the result cannot clobber the next row.
func (col *fixedLenByteArrayColumnBuffer) index(i int) []byte {
	j := (i + 0) * col.size
	k := (i + 1) * col.size
	return col.data[j:k:k]
}

// Write appends whole values from b, returning bytes consumed (io.Writer).
func (col *fixedLenByteArrayColumnBuffer) Write(b []byte) (int, error) {
	n, err := col.WriteFixedLenByteArrays(b)
	return n * col.size, err
}

// WriteFixedLenByteArrays appends values (a concatenation of size-byte
// elements) and returns the number of values written.
func (col *fixedLenByteArrayColumnBuffer) WriteFixedLenByteArrays(values []byte) (int, error) {
	d, m := len(values)/col.size, len(values)%col.size
	if m != 0 {
		return 0, fmt.Errorf("cannot write FIXED_LEN_BYTE_ARRAY values of size %d from input of size %d", col.size, len(values))
	}
	col.data = append(col.data, values...)
	return d, nil
}

func (col *fixedLenByteArrayColumnBuffer) WriteValues(values []Value) (int, error) {
	for _, v := range values {
		col.data = append(col.data, v.byteArray()...)
	}
	return len(values), nil
}

// writeValues appends rows.Len() values of col.size bytes each, growing the
// backing slice geometrically (at least 2x) when needed.
func (col *fixedLenByteArrayColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	n := col.size * rows.Len()
	i := len(col.data)
	j := len(col.data) + n

	if cap(col.data) < j {
		col.data = append(make([]byte, 0, max(i+n, 2*cap(col.data))), col.data...)
	}

	col.data = col.data[:j]
	newData := col.data[i:]

	// Note: this loop's i (row index) shadows the outer i (byte offset);
	// newData is the freshly extended tail of col.data.
	for i := 0; i < rows.Len(); i++ {
		p := rows.Index(i)
		copy(newData[i*col.size:], unsafe.Slice((*byte)(p), col.size))
	}
}

// ReadValuesAt copies rows starting at row offset; i advances in byte
// strides of col.size.
func (col *fixedLenByteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset) * col.size
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.data)/col.size))
	case i >= len(col.data):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.data) {
			values[n] = col.makeValueBytes(col.data[i : i+col.size])
			n++
			i += col.size
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

// uint32ColumnBuffer buffers unsigned 32-bit values; same contract as
// int32ColumnBuffer earlier in this file.
type uint32ColumnBuffer struct{ uint32Page }

// newUint32ColumnBuffer pre-allocates numValues entries; the column index is
// stored bitwise-NOT'ed, consistent with the other column buffers here.
func newUint32ColumnBuffer(typ Type, columnIndex int16, numValues int32) *uint32ColumnBuffer {
	return &uint32ColumnBuffer{
		uint32Page: uint32Page{
			typ:         typ,
			values:      make([]uint32, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *uint32ColumnBuffer) Clone() ColumnBuffer {
	return &uint32ColumnBuffer{
		uint32Page: uint32Page{
			typ:         col.typ,
			values:      append([]uint32{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *uint32ColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return uint32ColumnIndex{&col.uint32Page}, nil
}

func (col *uint32ColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return uint32OffsetIndex{&col.uint32Page}, nil
}

func (col *uint32ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *uint32ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *uint32ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *uint32ColumnBuffer) Page() Page { return &col.uint32Page }

func (col *uint32ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *uint32ColumnBuffer) Cap() int { return cap(col.values) }

func (col *uint32ColumnBuffer) Len() int { return len(col.values) }

func (col *uint32ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *uint32ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write reinterprets b in place as []uint32; len(b) must be a multiple of 4.
// The error message names INT32, the physical type the column is stored as.
func (col *uint32ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 4) != 0 {
		return 0, fmt.Errorf("cannot write INT32 values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToUint32(b)...)
	return len(b), nil
}

func (col *uint32ColumnBuffer) WriteUint32s(values []uint32) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *uint32ColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues grows geometrically (at least 2x) then bulk-gathers the rows.
func (col *uint32ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]uint32, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherUint32(col.values[n:], rows.Uint32Array())
}

func (col *uint32ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

// uint64ColumnBuffer buffers unsigned 64-bit values; same contract as
// int32ColumnBuffer earlier in this file.
type uint64ColumnBuffer struct{ uint64Page }

func newUint64ColumnBuffer(typ Type, columnIndex int16, numValues int32) *uint64ColumnBuffer {
	return &uint64ColumnBuffer{
		uint64Page: uint64Page{
			typ:         typ,
			values:      make([]uint64, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *uint64ColumnBuffer) Clone() ColumnBuffer {
	return &uint64ColumnBuffer{
		uint64Page: uint64Page{
			typ:         col.typ,
			values:      append([]uint64{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *uint64ColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return uint64ColumnIndex{&col.uint64Page}, nil
}

func (col *uint64ColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return uint64OffsetIndex{&col.uint64Page}, nil
}

func (col *uint64ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *uint64ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *uint64ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *uint64ColumnBuffer) Page() Page { return &col.uint64Page }

func (col *uint64ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *uint64ColumnBuffer) Cap() int { return cap(col.values) }

func (col *uint64ColumnBuffer) Len() int { return len(col.values) }

func (col *uint64ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *uint64ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// Write reinterprets b in place as []uint64; len(b) must be a multiple of 8.
// The error message names INT64, the physical type the column is stored as.
func (col *uint64ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 8) != 0 {
		return 0, fmt.Errorf("cannot write INT64 values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToUint64(b)...)
	return len(b), nil
}

func (col *uint64ColumnBuffer) WriteUint64s(values []uint64) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *uint64ColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

// writeValues grows geometrically (at least 2x) then bulk-gathers the rows.
func (col *uint64ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]uint64, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherUint64(col.values[n:], rows.Uint64Array())
}

// ReadValuesAt copies up to len(values) rows starting at row offset;
// io.EOF when the buffer is exhausted.
func (col *uint64ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

// be128ColumnBuffer buffers fixed 16-byte big-endian values as [16]byte
// elements; same general contract as the other column buffers in this file.
type be128ColumnBuffer struct{ be128Page }

func newBE128ColumnBuffer(typ Type, columnIndex int16, numValues int32) *be128ColumnBuffer {
	return &be128ColumnBuffer{
		be128Page: be128Page{
			typ:         typ,
			values:      make([][16]byte, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *be128ColumnBuffer) Clone() ColumnBuffer {
	return &be128ColumnBuffer{
		be128Page: be128Page{
			typ:         col.typ,
			values:      append([][16]byte{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *be128ColumnBuffer) ColumnIndex() (ColumnIndex, error) {
	return be128ColumnIndex{&col.be128Page}, nil
}

func (col *be128ColumnBuffer) OffsetIndex() (OffsetIndex, error) {
	return be128OffsetIndex{&col.be128Page}, nil
}

func (col *be128ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *be128ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *be128ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *be128ColumnBuffer) Page() Page { return &col.be128Page }

func (col *be128ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *be128ColumnBuffer) Cap() int { return cap(col.values) }

func (col *be128ColumnBuffer) Len() int { return len(col.values) }

func (col *be128ColumnBuffer) Less(i, j int) bool {
	return lessBE128(&col.values[i], &col.values[j])
}

func (col *be128ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

// WriteValues copies each Value's byte array into a fresh 16-byte element,
// growing the backing slice geometrically (at least 2x) when needed.
func (col *be128ColumnBuffer) WriteValues(values []Value) (int, error) {
	if n := len(col.values) + len(values); n > cap(col.values) {
		col.values = append(make([][16]byte, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+len(values)]
	newValues := col.values[n:]
	for i, v := range values {
		copy(newValues[i][:], v.byteArray())
	}
	return len(values), nil
}

// writeValues bulk-gathers 16-byte values from the sparse array.
func (col *be128ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([][16]byte, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherUint128(col.values[n:], rows.Uint128Array())
}

func (col *be128ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(&col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

// Compile-time interface conformance checks.
var (
	_ sort.Interface = (ColumnBuffer)(nil)
	_ io.Writer      = (*byteArrayColumnBuffer)(nil)
	_ io.Writer      = (*fixedLenByteArrayColumnBuffer)(nil)
)

// writeRowsFunc is the type of functions that apply rows to a set of column
// buffers.
//
// - columns is the array of column buffer where the rows are written.
//
// - rows is the array of Go values to write to the column buffers.
//
// - levels is used to track the column index, repetition and definition levels
// of values when writing optional or repeated columns.
type writeRowsFunc func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error

// writeRowsFuncOf generates a writeRowsFunc function for the given Go type and
// parquet schema. The column path indicates the column that the function is
// being generated for in the parquet schema.
func writeRowsFuncOf(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {
	// Columns with a JSON logical type get a dedicated writer regardless of
	// the Go kind being written.
	if leaf, exists := schema.Lookup(path...); exists && leaf.Node.Type().LogicalType() != nil && leaf.Node.Type().LogicalType().Json != nil {
		return writeRowsFuncOfJSON(t, schema, path)
	}

	// Special-case concrete types before dispatching on reflect.Kind.
	switch t {
	case reflect.TypeOf(deprecated.Int96{}):
		return writeRowsFuncOfRequired(t, schema, path)
	case reflect.TypeOf(time.Time{}):
		return writeRowsFuncOfTime(t, schema, path)
	}

	switch t.Kind() {
	case reflect.Bool,
		reflect.Int,
		reflect.Uint,
		reflect.Int32,
		reflect.Uint32,
		reflect.Int64,
		reflect.Uint64,
		reflect.Float32,
		reflect.Float64,
		reflect.String:
		return writeRowsFuncOfRequired(t, schema, path)

	case reflect.Slice:
		// []byte is treated as a single BYTE_ARRAY value, not a repeated column.
		if t.Elem().Kind() == reflect.Uint8 {
			return writeRowsFuncOfRequired(t, schema, path)
		} else {
			return writeRowsFuncOfSlice(t, schema, path)
		}

	case reflect.Array:
		// [N]byte maps to FIXED_LEN_BYTE_ARRAY; other arrays fall through to panic.
		if t.Elem().Kind() == reflect.Uint8 {
			return writeRowsFuncOfRequired(t, schema, path)
		}

	case reflect.Pointer:
		return writeRowsFuncOfPointer(t, schema, path)

	case reflect.Struct:
		return writeRowsFuncOfStruct(t, schema, path)

	case reflect.Map:
		return writeRowsFuncOfMap(t, schema, path)
	}

	panic("cannot convert Go values of type " + typeNameOf(t) + " to parquet value")
}

// writeRowsFuncOfRequired writes rows straight into the leaf column resolved
// from path, forwarding the caller's levels unchanged.
func writeRowsFuncOfRequired(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {
	column := schema.mapping.lookup(path)
	columnIndex := column.columnIndex
	return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {
		columns[columnIndex].writeValues(rows, levels)
		return nil
	}
}

// writeRowsFuncOfOptional wraps writeRows for an optional column: null rows
// are written with the caller's definition level, non-null rows with the
// level incremented by one.
func writeRowsFuncOfOptional(t reflect.Type, schema *Schema, path columnPath, writeRows writeRowsFunc) writeRowsFunc {
	nullIndex := nullIndexFuncOf(t)
	return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error {
		if rows.Len() == 0 {
			return writeRows(columns, rows, levels)
		}

		// nulls is a bitmap with one bit per row; set bits mark null rows.
		nulls := acquireBitmap(rows.Len())
		defer releaseBitmap(nulls)
		nullIndex(nulls.bits, rows)

		nullLevels := levels
		levels.definitionLevel++
		// In this function, we are dealing with optional values which are
		// neither pointers nor slices; for example, a int32 field marked
		// "optional" in its parent struct.
		//
		// We need to find zero values, which should be represented as nulls
		// in the parquet column. In order to minimize the calls to writeRows
		// and maximize throughput, we use the nullIndex and nonNullIndex
		// functions, which are type-specific implementations of the algorithm.
		//
		// Sections of the input that are contiguous nulls or non-nulls can be
		// sent to a single call to writeRows to be written to the underlying
		// buffer since they share the same definition level.
		//
		// This optimization is defeated by inputs alternating null and non-null
		// sequences of single values, we do not expect this condition to be a
		// common case.
		for i := 0; i < rows.Len(); {
			// x/y address bit y of word x in the bitmap; j is the exclusive
			// end of the current homogeneous run.
			j := 0
			x := i / 64
			y := i % 64

			if y != 0 {
				if b := nulls.bits[x] >> uint(y); b == 0 {
					x++
					y = 0
				} else {
					y += bits.TrailingZeros64(b)
					goto writeNulls
				}
			}

			// Skip whole words with no null bits set.
			for x < len(nulls.bits) && nulls.bits[x] == 0 {
				x++
			}

			if x < len(nulls.bits) {
				y = bits.TrailingZeros64(nulls.bits[x]) % 64
			}

		writeNulls:
			if j = x*64 + y; j > rows.Len() {
				j = rows.Len()
			}

			if i < j {
				if err := writeRows(columns, rows.Slice(i, j), nullLevels); err != nil {
					return err
				}
				i = j
			}

			if y != 0 {
				if b := nulls.bits[x] >> uint(y); b == (1<<uint64(y))-1 {
					x++
					y = 0
				} else {
					y += bits.TrailingZeros64(^b)
					goto writeNonNulls
				}
			}

			// Skip whole words where every row is null.
			for x < len(nulls.bits) && nulls.bits[x] == ^uint64(0) {
				x++
			}

			if x < len(nulls.bits) {
				y = bits.TrailingZeros64(^nulls.bits[x]) % 64
			}

		writeNonNulls:
			if j = x*64 + y; j > rows.Len() {
				j = rows.Len()
			}

			if i < j {
				if err := writeRows(columns, rows.Slice(i, j), levels); err != nil {
					return err
				}
				i = j
			}
		}

		return nil
	}
}

// writeRowsFuncOfPointer dereferences pointer rows before delegating to the
// writeRowsFunc generated for the element type.
func writeRowsFuncOfPointer(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc {
	elemType := t.Elem()
	elemSize := uintptr(elemType.Size())
	writeRows := writeRowsFuncOf(elemType, schema, path)

	if len(path) == 0 {
		// This code path is taken when generating a writeRowsFunc for a pointer
		// type. In this case, we do not need to increase the definition level
		// since we are not dealing with an optional field but a pointer to the
		// row type.
2169 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 2170 if rows.Len() == 0 { 2171 return writeRows(columns, rows, levels) 2172 } 2173 2174 for i := 0; i < rows.Len(); i++ { 2175 p := *(*unsafe.Pointer)(rows.Index(i)) 2176 a := sparse.Array{} 2177 if p != nil { 2178 a = makeArray(p, 1, elemSize) 2179 } 2180 if err := writeRows(columns, a, levels); err != nil { 2181 return err 2182 } 2183 } 2184 2185 return nil 2186 } 2187 } 2188 2189 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 2190 if rows.Len() == 0 { 2191 return writeRows(columns, rows, levels) 2192 } 2193 2194 for i := 0; i < rows.Len(); i++ { 2195 p := *(*unsafe.Pointer)(rows.Index(i)) 2196 a := sparse.Array{} 2197 elemLevels := levels 2198 if p != nil { 2199 a = makeArray(p, 1, elemSize) 2200 elemLevels.definitionLevel++ 2201 } 2202 if err := writeRows(columns, a, elemLevels); err != nil { 2203 return err 2204 } 2205 } 2206 2207 return nil 2208 } 2209 } 2210 2211 func writeRowsFuncOfSlice(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 2212 elemType := t.Elem() 2213 elemSize := uintptr(elemType.Size()) 2214 writeRows := writeRowsFuncOf(elemType, schema, path) 2215 2216 // When the element is a pointer type, the writeRows function will be an 2217 // instance returned by writeRowsFuncOfPointer, which handles incrementing 2218 // the definition level if the pointer value is not nil. 
2219 definitionLevelIncrement := byte(0) 2220 if elemType.Kind() != reflect.Ptr { 2221 definitionLevelIncrement = 1 2222 } 2223 2224 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 2225 if rows.Len() == 0 { 2226 return writeRows(columns, rows, levels) 2227 } 2228 2229 levels.repetitionDepth++ 2230 2231 for i := 0; i < rows.Len(); i++ { 2232 p := (*sliceHeader)(rows.Index(i)) 2233 a := makeArray(p.base, p.len, elemSize) 2234 b := sparse.Array{} 2235 2236 elemLevels := levels 2237 if a.Len() > 0 { 2238 b = a.Slice(0, 1) 2239 elemLevels.definitionLevel += definitionLevelIncrement 2240 } 2241 2242 if err := writeRows(columns, b, elemLevels); err != nil { 2243 return err 2244 } 2245 2246 if a.Len() > 1 { 2247 elemLevels.repetitionLevel = elemLevels.repetitionDepth 2248 2249 if err := writeRows(columns, a.Slice(1, a.Len()), elemLevels); err != nil { 2250 return err 2251 } 2252 } 2253 } 2254 2255 return nil 2256 } 2257 } 2258 2259 func writeRowsFuncOfStruct(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 2260 type column struct { 2261 offset uintptr 2262 writeRows writeRowsFunc 2263 } 2264 2265 fields := structFieldsOf(t) 2266 columns := make([]column, len(fields)) 2267 2268 for i, f := range fields { 2269 optional := false 2270 columnPath := path.append(f.Name) 2271 forEachStructTagOption(f, func(_ reflect.Type, option, _ string) { 2272 switch option { 2273 case "list": 2274 columnPath = columnPath.append("list", "element") 2275 case "optional": 2276 optional = true 2277 } 2278 }) 2279 2280 writeRows := writeRowsFuncOf(f.Type, schema, columnPath) 2281 if optional { 2282 switch f.Type.Kind() { 2283 case reflect.Pointer, reflect.Slice: 2284 default: 2285 writeRows = writeRowsFuncOfOptional(f.Type, schema, columnPath, writeRows) 2286 } 2287 } 2288 2289 columns[i] = column{ 2290 offset: f.Offset, 2291 writeRows: writeRows, 2292 } 2293 } 2294 2295 return func(buffers []ColumnBuffer, rows sparse.Array, levels columnLevels) 
error { 2296 if rows.Len() == 0 { 2297 for _, column := range columns { 2298 if err := column.writeRows(buffers, rows, levels); err != nil { 2299 return err 2300 } 2301 } 2302 } else { 2303 for _, column := range columns { 2304 if err := column.writeRows(buffers, rows.Offset(column.offset), levels); err != nil { 2305 return err 2306 } 2307 } 2308 } 2309 return nil 2310 } 2311 } 2312 2313 func writeRowsFuncOfMap(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 2314 keyPath := path.append("key_value", "key") 2315 keyType := t.Key() 2316 keySize := uintptr(keyType.Size()) 2317 writeKeys := writeRowsFuncOf(keyType, schema, keyPath) 2318 2319 valuePath := path.append("key_value", "value") 2320 valueType := t.Elem() 2321 valueSize := uintptr(valueType.Size()) 2322 writeValues := writeRowsFuncOf(valueType, schema, valuePath) 2323 2324 writeKeyValues := func(columns []ColumnBuffer, keys, values sparse.Array, levels columnLevels) error { 2325 if err := writeKeys(columns, keys, levels); err != nil { 2326 return err 2327 } 2328 if err := writeValues(columns, values, levels); err != nil { 2329 return err 2330 } 2331 return nil 2332 } 2333 2334 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 2335 if rows.Len() == 0 { 2336 return writeKeyValues(columns, rows, rows, levels) 2337 } 2338 2339 levels.repetitionDepth++ 2340 mapKey := reflect.New(keyType).Elem() 2341 mapValue := reflect.New(valueType).Elem() 2342 2343 for i := 0; i < rows.Len(); i++ { 2344 m := reflect.NewAt(t, rows.Index(i)).Elem() 2345 2346 if m.Len() == 0 { 2347 empty := sparse.Array{} 2348 if err := writeKeyValues(columns, empty, empty, levels); err != nil { 2349 return err 2350 } 2351 } else { 2352 elemLevels := levels 2353 elemLevels.definitionLevel++ 2354 2355 for it := m.MapRange(); it.Next(); { 2356 mapKey.SetIterKey(it) 2357 mapValue.SetIterValue(it) 2358 2359 k := makeArray(unsafecast.PointerOfValue(mapKey), 1, keySize) 2360 v := 
makeArray(unsafecast.PointerOfValue(mapValue), 1, valueSize) 2361 2362 if err := writeKeyValues(columns, k, v, elemLevels); err != nil { 2363 return err 2364 } 2365 2366 elemLevels.repetitionLevel = elemLevels.repetitionDepth 2367 } 2368 } 2369 } 2370 2371 return nil 2372 } 2373 } 2374 2375 func writeRowsFuncOfJSON(t reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 2376 // If this is a string or a byte array write directly. 2377 switch t.Kind() { 2378 case reflect.String: 2379 return writeRowsFuncOfRequired(t, schema, path) 2380 case reflect.Slice: 2381 if t.Elem().Kind() == reflect.Uint8 { 2382 return writeRowsFuncOfRequired(t, schema, path) 2383 } 2384 } 2385 2386 // Otherwise handle with a json.Marshal 2387 asStrT := reflect.TypeOf(string("")) 2388 writer := writeRowsFuncOfRequired(asStrT, schema, path) 2389 2390 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 2391 if rows.Len() == 0 { 2392 return writer(columns, rows, levels) 2393 } 2394 for i := 0; i < rows.Len(); i++ { 2395 val := reflect.NewAt(t, rows.Index(i)) 2396 asI := val.Interface() 2397 2398 b, err := json.Marshal(asI) 2399 if err != nil { 2400 return err 2401 } 2402 2403 asStr := string(b) 2404 a := sparse.MakeStringArray([]string{asStr}) 2405 if err := writer(columns, a.UnsafeArray(), levels); err != nil { 2406 return err 2407 } 2408 } 2409 return nil 2410 } 2411 } 2412 2413 func writeRowsFuncOfTime(_ reflect.Type, schema *Schema, path columnPath) writeRowsFunc { 2414 t := reflect.TypeOf(int64(0)) 2415 elemSize := uintptr(t.Size()) 2416 writeRows := writeRowsFuncOf(t, schema, path) 2417 2418 col, _ := schema.Lookup(path...) 
2419 unit := Nanosecond.TimeUnit() 2420 lt := col.Node.Type().LogicalType() 2421 if lt != nil && lt.Timestamp != nil { 2422 unit = lt.Timestamp.Unit 2423 } 2424 2425 return func(columns []ColumnBuffer, rows sparse.Array, levels columnLevels) error { 2426 if rows.Len() == 0 { 2427 return writeRows(columns, rows, levels) 2428 } 2429 2430 times := rows.TimeArray() 2431 for i := 0; i < times.Len(); i++ { 2432 t := times.Index(i) 2433 var val int64 2434 switch { 2435 case unit.Millis != nil: 2436 val = t.UnixMilli() 2437 case unit.Micros != nil: 2438 val = t.UnixMicro() 2439 default: 2440 val = t.UnixNano() 2441 } 2442 2443 a := makeArray(unsafecast.PointerOfValue(reflect.ValueOf(val)), 1, elemSize) 2444 if err := writeRows(columns, a, levels); err != nil { 2445 return err 2446 } 2447 } 2448 2449 return nil 2450 } 2451 }