// Source: github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/column_buffer.go

package parquet

import (
	"bytes"
	"fmt"
	"io"
	"sort"
	"unsafe"

	"github.com/vc42/parquet-go/deprecated"
	"github.com/vc42/parquet-go/encoding/plain"
	"github.com/vc42/parquet-go/internal/bitpack"
	"github.com/vc42/parquet-go/internal/unsafecast"
	"github.com/vc42/parquet-go/sparse"
)

// ColumnBuffer is an interface representing columns of a row group.
//
// ColumnBuffer implements sort.Interface as a way to support reordering the
// rows that have been written to it.
//
// The current implementation has a limitation which prevents applications from
// providing custom versions of this interface because it contains unexported
// methods. The only way to create ColumnBuffer values is to call NewColumnBuffer
// on Type instances. This limitation may be lifted in future releases.
type ColumnBuffer interface {
	// Exposes a read-only view of the column buffer.
	ColumnChunk

	// The column implements ValueReaderAt as a mechanism to read values at
	// specific locations within the buffer.
	ValueReaderAt

	// The column implements ValueWriter as a mechanism to optimize the copy
	// of values into the buffer in contexts where the row information is
	// provided by the values because the repetition and definition levels
	// are set.
	ValueWriter

	// For indexed columns, returns the underlying dictionary holding the column
	// values. If the column is not indexed, nil is returned.
	Dictionary() Dictionary

	// Returns a copy of the column. The returned copy shares no memory with
	// the original; mutations of either column will not modify the other.
	Clone() ColumnBuffer

	// Returns the column as a BufferedPage.
	Page() BufferedPage

	// Clears all rows written to the column.
	Reset()

	// Returns the current capacity of the column (rows).
	Cap() int

	// Returns the number of rows currently written to the column.
	Len() int

	// Compares rows at index i and j and reports whether i < j.
	Less(i, j int) bool

	// Swaps rows at index i and j.
	Swap(i, j int)

	// Returns the size of the column buffer in bytes.
	Size() int64

	// This method is employed to write rows from arrays of Go values into the
	// column buffer. The method is currently unexported because it uses unsafe
	// APIs which would be difficult for applications to leverage, increasing
	// the risk of introducing bugs in the code. As a consequence, applications
	// cannot use custom implementations of the ColumnBuffer interface since
	// they cannot declare an unexported method that would match this signature.
	// It means that in order to create a ColumnBuffer value, programs need to
	// go through a call to NewColumnBuffer on a Type instance. We make this
	// trade-off for now as it is preferable to optimize for safety over
	// extensibility in the public APIs; we might revisit in the future if we
	// learn about valid use cases for custom column buffer types.
	writeValues(rows sparse.Array, levels columnLevels)
}
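// The following sketch illustrates how column buffers are meant to be obtained
// and used through the exported API described above. It is only an editorial
// example: the parquet.BooleanType value, the parquet.ValueOf helper, and the
// Type.NewColumnBuffer method are assumed from the rest of this package, and
// error handling is omitted.
//
//	buf := parquet.BooleanType.NewColumnBuffer(0, 1024) // column 0, capacity hint
//	buf.WriteValues([]parquet.Value{parquet.ValueOf(true), parquet.ValueOf(false)})
//	sort.Sort(buf)     // ColumnBuffer implements sort.Interface
//	page := buf.Page() // materialize a read-only view of the buffered rows
//	_ = page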
type columnLevels struct {
	repetitionDepth byte
	repetitionLevel byte
	definitionLevel byte
}

func columnIndexOfNullable(base ColumnBuffer, maxDefinitionLevel byte, definitionLevels []byte) ColumnIndex {
	return &nullableColumnIndex{
		ColumnIndex: base.ColumnIndex(),
		maxDefinitionLevel: maxDefinitionLevel,
		definitionLevels: definitionLevels,
	}
}

type nullableColumnIndex struct {
	ColumnIndex
	maxDefinitionLevel byte
	definitionLevels []byte
}

func (index *nullableColumnIndex) NullPage(i int) bool {
	return index.NullCount(i) == int64(len(index.definitionLevels))
}

func (index *nullableColumnIndex) NullCount(i int) int64 {
	return int64(countLevelsNotEqual(index.definitionLevels, index.maxDefinitionLevel))
}

type nullOrdering func(column ColumnBuffer, i, j int, maxDefinitionLevel, definitionLevel1, definitionLevel2 byte) bool

func nullsGoFirst(column ColumnBuffer, i, j int, maxDefinitionLevel, definitionLevel1, definitionLevel2 byte) bool {
	if definitionLevel1 != maxDefinitionLevel {
		return definitionLevel2 == maxDefinitionLevel
	} else {
		return definitionLevel2 == maxDefinitionLevel && column.Less(i, j)
	}
}

func nullsGoLast(column ColumnBuffer, i, j int, maxDefinitionLevel, definitionLevel1, definitionLevel2 byte) bool {
	return definitionLevel1 == maxDefinitionLevel && (definitionLevel2 != maxDefinitionLevel || column.Less(i, j))
}
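// To make the two orderings above concrete: with a maxDefinitionLevel of 1, a
// value whose definition level is 0 is null. nullsGoFirst reports a null as
// less than any non-null value, so nulls cluster at the beginning of the
// sorted column; nullsGoLast reports the opposite and only consults
// column.Less when both values are non-null.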
// reversedColumnBuffer is an adapter of ColumnBuffer which inverts the order
// in which rows are ordered when the column gets sorted.
//
// This type is used when buffers are constructed with sorting columns ordering
// values in descending order.
type reversedColumnBuffer struct{ ColumnBuffer }

func (col *reversedColumnBuffer) Less(i, j int) bool { return col.ColumnBuffer.Less(j, i) }

// optionalColumnBuffer is an implementation of the ColumnBuffer interface used
// as a wrapper to an underlying ColumnBuffer to manage the creation of
// definition levels.
//
// Null values are not written to the underlying column; instead, the buffer
// tracks offsets of row values in the column, null row values are represented
// by the value -1 and a definition level less than the max.
//
// This column buffer type is used for all leaf columns that have a non-zero
// max definition level and a zero repetition level, which may be because the
// column or one of its parent(s) are marked optional.
type optionalColumnBuffer struct {
	base ColumnBuffer
	reordered bool
	maxDefinitionLevel byte
	rows []int32
	sortIndex []int32
	definitionLevels []byte
	nullOrdering nullOrdering
}

func newOptionalColumnBuffer(base ColumnBuffer, maxDefinitionLevel byte, nullOrdering nullOrdering) *optionalColumnBuffer {
	n := base.Cap()
	return &optionalColumnBuffer{
		base: base,
		maxDefinitionLevel: maxDefinitionLevel,
		rows: make([]int32, 0, n),
		definitionLevels: make([]byte, 0, n),
		nullOrdering: nullOrdering,
	}
}

func (col *optionalColumnBuffer) Clone() ColumnBuffer {
	return &optionalColumnBuffer{
		base: col.base.Clone(),
		reordered: col.reordered,
		maxDefinitionLevel: col.maxDefinitionLevel,
		rows: append([]int32{}, col.rows...),
		definitionLevels: append([]byte{}, col.definitionLevels...),
		nullOrdering: col.nullOrdering,
	}
}

func (col *optionalColumnBuffer) Type() Type {
	return col.base.Type()
}

func (col *optionalColumnBuffer) NumValues() int64 {
	return int64(len(col.definitionLevels))
}

func (col *optionalColumnBuffer) ColumnIndex() ColumnIndex {
	return columnIndexOfNullable(col.base, col.maxDefinitionLevel, col.definitionLevels)
}

func (col *optionalColumnBuffer) OffsetIndex() OffsetIndex {
	return col.base.OffsetIndex()
}

func (col *optionalColumnBuffer) BloomFilter() BloomFilter {
	return col.base.BloomFilter()
}

func (col *optionalColumnBuffer) Dictionary() Dictionary {
	return col.base.Dictionary()
}

func (col *optionalColumnBuffer) Column() int {
	return col.base.Column()
}

func (col *optionalColumnBuffer) Pages() Pages {
	return onePage(col.Page())
}

func (col *optionalColumnBuffer) Page() BufferedPage {
	// No need for any cyclic sorting if the rows have not been reordered.
	// This case is also important because the cyclic sorting modifies the
	// buffer which makes it unsafe to read the buffer concurrently.
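	// As a worked example of the reordering below: if sorting left
	// col.rows = [1, -1, 0], then logical row 0 reads its value from base
	// index 1, row 1 is null, and row 2 reads from base index 0. The loop
	// below builds sortIndex = [1, 0] (base index -> rank among non-null
	// rows), and the cyclic sort swaps the two base values so they end up
	// stored in the same order as the logical rows.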
	if col.reordered {
		numNulls := countLevelsNotEqual(col.definitionLevels, col.maxDefinitionLevel)
		numValues := len(col.rows) - numNulls

		if numValues > 0 {
			if cap(col.sortIndex) < numValues {
				col.sortIndex = make([]int32, numValues)
			}
			sortIndex := col.sortIndex[:numValues]
			i := 0
			for _, j := range col.rows {
				if j >= 0 {
					sortIndex[j] = int32(i)
					i++
				}
			}

			// Cyclic sort: O(N)
			for i := range sortIndex {
				for j := int(sortIndex[i]); i != j; j = int(sortIndex[i]) {
					col.base.Swap(i, j)
					sortIndex[i], sortIndex[j] = sortIndex[j], sortIndex[i]
				}
			}
		}

		i := 0
		for _, r := range col.rows {
			if r >= 0 {
				col.rows[i] = int32(i)
				i++
			}
		}

		col.reordered = false
	}

	return newOptionalPage(col.base.Page(), col.maxDefinitionLevel, col.definitionLevels)
}

func (col *optionalColumnBuffer) Reset() {
	col.base.Reset()
	col.rows = col.rows[:0]
	col.definitionLevels = col.definitionLevels[:0]
}

func (col *optionalColumnBuffer) Size() int64 {
	return int64(4*len(col.rows)+4*len(col.sortIndex)+len(col.definitionLevels)) + col.base.Size()
}

func (col *optionalColumnBuffer) Cap() int { return cap(col.rows) }

func (col *optionalColumnBuffer) Len() int { return len(col.rows) }

func (col *optionalColumnBuffer) Less(i, j int) bool {
	return col.nullOrdering(
		col.base,
		int(col.rows[i]),
		int(col.rows[j]),
		col.maxDefinitionLevel,
		col.definitionLevels[i],
		col.definitionLevels[j],
	)
}

func (col *optionalColumnBuffer) Swap(i, j int) {
	// Because the underlying column does not contain null values, we cannot
	// swap its values at indexes i and j. We swap the row indexes only, then
	// reorder the underlying buffer using a cyclic sort when the buffer is
	// materialized into a page view.
	col.reordered = true
	col.rows[i], col.rows[j] = col.rows[j], col.rows[i]
	col.definitionLevels[i], col.definitionLevels[j] = col.definitionLevels[j], col.definitionLevels[i]
}

func (col *optionalColumnBuffer) WriteValues(values []Value) (n int, err error) {
	rowIndex := int32(col.base.Len())

	for n < len(values) {
		// Collect index range of contiguous null values, from i to n. If this
		// for loop exhausts the values, all remaining if statements and for
		// loops will be no-ops and the loop will terminate.
		i := n
		for n < len(values) && values[n].definitionLevel != col.maxDefinitionLevel {
			n++
		}

		// Write the contiguous null values up until the first non-null value
		// obtained in the for loop above.
		for _, v := range values[i:n] {
			col.rows = append(col.rows, -1)
			col.definitionLevels = append(col.definitionLevels, v.definitionLevel)
		}

		// Collect index range of contiguous non-null values, from i to n.
		i = n
		for n < len(values) && values[n].definitionLevel == col.maxDefinitionLevel {
			n++
		}

		// As long as i < n we have non-null values still to write. It is
		// possible that we just exhausted the input values in which case i == n
		// and the outer for loop will terminate.
		if i < n {
			count, err := col.base.WriteValues(values[i:n])
			col.definitionLevels = appendLevel(col.definitionLevels, col.maxDefinitionLevel, count)

			for count > 0 {
				col.rows = append(col.rows, rowIndex)
				rowIndex++
				count--
			}

			if err != nil {
				return n, err
			}
		}
	}
	return n, nil
}

func (col *optionalColumnBuffer) writeValues(rows sparse.Array, levels columnLevels) {
	// The row count is zero when writing a null optional value, in which case
	// we still need to output a row to the buffer to record the definition
	// level.
	if rows.Len() == 0 {
		col.definitionLevels = append(col.definitionLevels, levels.definitionLevel)
		col.rows = append(col.rows, -1)
		return
	}

	col.definitionLevels = appendLevel(col.definitionLevels, levels.definitionLevel, rows.Len())

	i := len(col.rows)
	j := len(col.rows) + rows.Len()

	if j <= cap(col.rows) {
		col.rows = col.rows[:j]
	} else {
		tmp := make([]int32, j, 2*j)
		copy(tmp, col.rows)
		col.rows = tmp
	}

	if levels.definitionLevel != col.maxDefinitionLevel {
		broadcastValueInt32(col.rows[i:], -1)
	} else {
		broadcastRangeInt32(col.rows[i:], int32(col.base.Len()))
		col.base.writeValues(rows, levels)
	}
}

func (col *optionalColumnBuffer) ReadValuesAt(values []Value, offset int64) (int, error) {
	length := int64(len(col.definitionLevels))
	if offset < 0 {
		return 0, errRowIndexOutOfBounds(offset, length)
	}
	if offset >= length {
		return 0, io.EOF
	}
	if length -= offset; length < int64(len(values)) {
		values = values[:length]
	}

	numNulls1 := int64(countLevelsNotEqual(col.definitionLevels[:offset], col.maxDefinitionLevel))
	numNulls2 := int64(countLevelsNotEqual(col.definitionLevels[offset:offset+length], col.maxDefinitionLevel))

	if numNulls2 < length {
		n, err := col.base.ReadValuesAt(values[:length-numNulls2], offset-numNulls1)
		if err != nil {
			return n, err
		}
	}

	if numNulls2 > 0 {
		columnIndex := ^int16(col.Column())
		i := numNulls2 - 1
		j := length - 1
		definitionLevels := col.definitionLevels[offset : offset+length]
		maxDefinitionLevel := col.maxDefinitionLevel

		for n := len(definitionLevels) - 1; n >= 0 && j > i; n-- {
			if definitionLevels[n] != maxDefinitionLevel {
				values[j] = Value{definitionLevel: definitionLevels[n], columnIndex: columnIndex}
			} else {
				values[j] = values[i]
				i--
			}
			j--
		}
	}

	return int(length), nil
}
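// For illustration, with a maxDefinitionLevel of 1: writing the values
// 1, null, 2 to an optionalColumnBuffer stores only 1 and 2 in the base
// buffer and records rows = [0, -1, 1] with definitionLevels = [1, 0, 1].
// The -1 marks the null row, and ReadValuesAt rebuilds the null value from
// its definition level alone.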
// repeatedColumnBuffer is an implementation of the ColumnBuffer interface used
// as a wrapper to an underlying ColumnBuffer to manage the creation of
// repetition levels and definition levels, and to map rows to the region of
// the underlying buffer that contains their sequence of values.
//
// Null values are not written to the underlying column; instead, the buffer
// tracks offsets of row values in the column, null row values are represented
// by the value -1 and a definition level less than the max.
//
// This column buffer type is used for all leaf columns that have a non-zero
// max repetition level, which may be because the column or one of its parent(s)
// are marked repeated.
type repeatedColumnBuffer struct {
	base ColumnBuffer
	reordered bool
	maxRepetitionLevel byte
	maxDefinitionLevel byte
	rows []region
	repetitionLevels []byte
	definitionLevels []byte
	buffer []Value
	reordering *repeatedColumnBuffer
	nullOrdering nullOrdering
}

// The region type maps the logical offset of rows within the repetition and
// definition levels, to the base offsets in the underlying column buffers
// where the non-null values have been written.
type region struct {
	offset uint32
	baseOffset uint32
}

func sizeOfRegion(regions []region) int64 { return 8 * int64(len(regions)) }

func newRepeatedColumnBuffer(base ColumnBuffer, maxRepetitionLevel, maxDefinitionLevel byte, nullOrdering nullOrdering) *repeatedColumnBuffer {
	n := base.Cap()
	return &repeatedColumnBuffer{
		base: base,
		maxRepetitionLevel: maxRepetitionLevel,
		maxDefinitionLevel: maxDefinitionLevel,
		rows: make([]region, 0, n/8),
		repetitionLevels: make([]byte, 0, n),
		definitionLevels: make([]byte, 0, n),
		nullOrdering: nullOrdering,
	}
}

func (col *repeatedColumnBuffer) Clone() ColumnBuffer {
	return &repeatedColumnBuffer{
		base: col.base.Clone(),
		reordered: col.reordered,
		maxRepetitionLevel: col.maxRepetitionLevel,
		maxDefinitionLevel: col.maxDefinitionLevel,
		rows: append([]region{}, col.rows...),
		repetitionLevels: append([]byte{}, col.repetitionLevels...),
		definitionLevels: append([]byte{}, col.definitionLevels...),
		nullOrdering: col.nullOrdering,
	}
}

func (col *repeatedColumnBuffer) Type() Type {
	return col.base.Type()
}

func (col *repeatedColumnBuffer) NumValues() int64 {
	return int64(len(col.definitionLevels))
}

func (col *repeatedColumnBuffer) ColumnIndex() ColumnIndex {
	return columnIndexOfNullable(col.base, col.maxDefinitionLevel, col.definitionLevels)
}

func (col *repeatedColumnBuffer) OffsetIndex() OffsetIndex {
	return col.base.OffsetIndex()
}

func (col *repeatedColumnBuffer) BloomFilter() BloomFilter {
	return col.base.BloomFilter()
}

func (col *repeatedColumnBuffer) Dictionary() Dictionary {
	return col.base.Dictionary()
}

func (col *repeatedColumnBuffer) Column() int {
	return col.base.Column()
}

func (col *repeatedColumnBuffer) Pages() Pages {
	return onePage(col.Page())
}

func (col *repeatedColumnBuffer) Page() BufferedPage {
	if col.reordered {
		if col.reordering == nil {
			col.reordering = col.Clone().(*repeatedColumnBuffer)
		}

		column := col.reordering
		column.Reset()
		maxNumValues := 0
		defer func() {
			clearValues(col.buffer[:maxNumValues])
		}()

		baseOffset := 0

		for _, row := range col.rows {
			rowOffset := int(row.offset)
			rowLength := repeatedRowLength(col.repetitionLevels[rowOffset:])
			numNulls := countLevelsNotEqual(col.definitionLevels[rowOffset:rowOffset+rowLength], col.maxDefinitionLevel)
			numValues := rowLength - numNulls

			if numValues > 0 {
				if numValues > cap(col.buffer) {
					col.buffer = make([]Value, numValues)
				} else {
					col.buffer = col.buffer[:numValues]
				}
				n, err := col.base.ReadValuesAt(col.buffer, int64(row.baseOffset))
				if err != nil && n < numValues {
					return newErrorPage(col.Type(), col.Column(), "reordering rows of repeated column: %w", err)
				}
				if _, err := column.base.WriteValues(col.buffer); err != nil {
					return newErrorPage(col.Type(), col.Column(), "reordering rows of repeated column: %w", err)
				}
				if numValues > maxNumValues {
					maxNumValues = numValues
				}
			}

			column.rows = append(column.rows, region{
				offset: uint32(len(column.repetitionLevels)),
				baseOffset: uint32(baseOffset),
			})

			column.repetitionLevels = append(column.repetitionLevels, col.repetitionLevels[rowOffset:rowOffset+rowLength]...)
			column.definitionLevels = append(column.definitionLevels, col.definitionLevels[rowOffset:rowOffset+rowLength]...)
			baseOffset += numValues
		}

		col.swapReorderingBuffer(column)
		col.reordered = false
	}

	return newRepeatedPage(
		col.base.Page(),
		col.maxRepetitionLevel,
		col.maxDefinitionLevel,
		col.repetitionLevels,
		col.definitionLevels,
	)
}

func (col *repeatedColumnBuffer) swapReorderingBuffer(buf *repeatedColumnBuffer) {
	col.base, buf.base = buf.base, col.base
	col.rows, buf.rows = buf.rows, col.rows
	col.repetitionLevels, buf.repetitionLevels = buf.repetitionLevels, col.repetitionLevels
	col.definitionLevels, buf.definitionLevels = buf.definitionLevels, col.definitionLevels
}

func (col *repeatedColumnBuffer) Reset() {
	col.base.Reset()
	col.rows = col.rows[:0]
	col.repetitionLevels = col.repetitionLevels[:0]
	col.definitionLevels = col.definitionLevels[:0]
}

func (col *repeatedColumnBuffer) Size() int64 {
	return sizeOfRegion(col.rows) + int64(len(col.repetitionLevels)) + int64(len(col.definitionLevels)) + col.base.Size()
}

func (col *repeatedColumnBuffer) Cap() int { return cap(col.rows) }

func (col *repeatedColumnBuffer) Len() int { return len(col.rows) }

func (col *repeatedColumnBuffer) Less(i, j int) bool {
	row1 := col.rows[i]
	row2 := col.rows[j]
	less := col.nullOrdering
	row1Length := repeatedRowLength(col.repetitionLevels[row1.offset:])
	row2Length := repeatedRowLength(col.repetitionLevels[row2.offset:])

	for k := 0; k < row1Length && k < row2Length; k++ {
		x := int(row1.baseOffset)
		y := int(row2.baseOffset)
		definitionLevel1 := col.definitionLevels[int(row1.offset)+k]
		definitionLevel2 := col.definitionLevels[int(row2.offset)+k]
		switch {
		case less(col.base, x, y, col.maxDefinitionLevel, definitionLevel1, definitionLevel2):
			return true
		case less(col.base, y, x, col.maxDefinitionLevel, definitionLevel2, definitionLevel1):
			return false
		}
	}

	return row1Length < row2Length
}

func (col *repeatedColumnBuffer) Swap(i, j int) {
	// Because the underlying column does not contain null values, and may hold
	// an arbitrary number of values per row, we cannot swap its values at
	// indexes i and j. We swap the row indexes only, then reorder the base
	// column buffer when its view is materialized into a page by creating a
	// copy and writing rows back to it following the order of rows in the
	// repeated column buffer.
	col.reordered = true
	col.rows[i], col.rows[j] = col.rows[j], col.rows[i]
}
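// For illustration: writing the two rows ["a", "b"] and ["c"] to a repeated
// column with maxRepetitionLevel = 1 yields repetitionLevels = [0, 1, 0]
// (a repetition level of zero marks the first value of a row) and
// rows = [{offset: 0, baseOffset: 0}, {offset: 2, baseOffset: 2}]; each
// region points at its slice of levels and at the position of its first
// value in the base buffer.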
func (col *repeatedColumnBuffer) WriteValues(values []Value) (numValues int, err error) {
	maxRowLen := 0
	defer func() {
		clearValues(col.buffer[:maxRowLen])
	}()

	for i := 0; i < len(values); {
		j := i

		if values[j].repetitionLevel == 0 {
			j++
		}

		for j < len(values) && values[j].repetitionLevel != 0 {
			j++
		}

		if err := col.writeRow(values[i:j]); err != nil {
			return numValues, err
		}

		if len(col.buffer) > maxRowLen {
			maxRowLen = len(col.buffer)
		}

		numValues += j - i
		i = j
	}

	return numValues, nil
}

func (col *repeatedColumnBuffer) writeRow(row []Value) error {
	col.buffer = col.buffer[:0]

	for _, v := range row {
		if v.definitionLevel == col.maxDefinitionLevel {
			col.buffer = append(col.buffer, v)
		}
	}

	baseOffset := col.base.NumValues()
	if len(col.buffer) > 0 {
		if _, err := col.base.WriteValues(col.buffer); err != nil {
			return err
		}
	}

	if row[0].repetitionLevel == 0 {
		col.rows = append(col.rows, region{
			offset: uint32(len(col.repetitionLevels)),
			baseOffset: uint32(baseOffset),
		})
	}

	for _, v := range row {
		col.repetitionLevels = append(col.repetitionLevels, v.repetitionLevel)
		col.definitionLevels = append(col.definitionLevels, v.definitionLevel)
	}

	return nil
}

func (col *repeatedColumnBuffer) writeValues(row sparse.Array, levels columnLevels) {
	if levels.repetitionLevel == 0 {
		col.rows = append(col.rows, region{
			offset: uint32(len(col.repetitionLevels)),
			baseOffset: uint32(col.base.NumValues()),
		})
	}

	if row.Len() == 0 {
		col.repetitionLevels = append(col.repetitionLevels, levels.repetitionLevel)
		col.definitionLevels = append(col.definitionLevels, levels.definitionLevel)
		return
	}

	col.repetitionLevels = appendLevel(col.repetitionLevels, levels.repetitionLevel, row.Len())
	col.definitionLevels = appendLevel(col.definitionLevels, levels.definitionLevel, row.Len())

	if levels.definitionLevel == col.maxDefinitionLevel {
		col.base.writeValues(row, levels)
	}
}

func (col *repeatedColumnBuffer) ReadValuesAt(values []Value, offset int64) (int, error) {
	// TODO:
	panic("NOT IMPLEMENTED")
}

// =============================================================================
// The types below are in-memory implementations of the ColumnBuffer interface
// for each parquet type.
//
// These column buffers are created by calling NewColumnBuffer on parquet.Type
// instances; each parquet type knows how to construct column buffers of the
// appropriate type, which ensures that values are packed as densely as
// possible in memory.
//
// See Type.NewColumnBuffer for details about how these types get created.
// =============================================================================

type booleanColumnBuffer struct{ booleanPage }

func newBooleanColumnBuffer(typ Type, columnIndex int16, numValues int32) *booleanColumnBuffer {
	// Boolean values are bit-packed; up to 8 values fit in each byte.
	bufferSize := (numValues + 7) / 8
	return &booleanColumnBuffer{
		booleanPage: booleanPage{
			typ: typ,
			bits: make([]byte, 0, bufferSize),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *booleanColumnBuffer) Clone() ColumnBuffer {
	return &booleanColumnBuffer{
		booleanPage: booleanPage{
			typ: col.typ,
			bits: append([]byte{}, col.bits...),
			offset: col.offset,
			numValues: col.numValues,
			columnIndex: col.columnIndex,
		},
	}
}

func (col *booleanColumnBuffer) ColumnIndex() ColumnIndex {
	return booleanColumnIndex{&col.booleanPage}
}

func (col *booleanColumnBuffer) OffsetIndex() OffsetIndex {
	return booleanOffsetIndex{&col.booleanPage}
}

func (col *booleanColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *booleanColumnBuffer) Dictionary() Dictionary { return nil }

func (col *booleanColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *booleanColumnBuffer) Page() BufferedPage { return &col.booleanPage }

func (col *booleanColumnBuffer) Reset() {
	col.bits = col.bits[:0]
	col.offset = 0
	col.numValues = 0
}

func (col *booleanColumnBuffer) Cap() int { return 8 * cap(col.bits) }

func (col *booleanColumnBuffer) Len() int { return int(col.numValues) }

func (col *booleanColumnBuffer) Less(i, j int) bool {
	a := col.valueAt(i)
	b := col.valueAt(j)
	return a != b && !a
}

func (col *booleanColumnBuffer) valueAt(i int) bool {
	j := uint32(i) / 8
	k := uint32(i) % 8
	return ((col.bits[j] >> k) & 1) != 0
}

func (col *booleanColumnBuffer) setValueAt(i int, v bool) {
	// `offset` is always zero in the page of a column buffer
	j := uint32(i) / 8
	k := uint32(i) % 8
	x := byte(0)
	if v {
		x = 1
	}
	col.bits[j] = (col.bits[j] & ^(1 << k)) | (x << k)
}

func (col *booleanColumnBuffer) Swap(i, j int) {
	a := col.valueAt(i)
	b := col.valueAt(j)
	col.setValueAt(i, b)
	col.setValueAt(j, a)
}

func (col *booleanColumnBuffer) WriteBooleans(values []bool) (int, error) {
	col.writeValues(sparse.MakeBoolArray(values).UnsafeArray(), columnLevels{})
	return len(values), nil
}

func (col *booleanColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

func (col *booleanColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	numBytes := bitpack.ByteCount(uint(col.numValues) + uint(rows.Len()))
	if cap(col.bits) < numBytes {
		col.bits = append(make([]byte, 0, 2*cap(col.bits)), col.bits...)
	}
	col.bits = col.bits[:numBytes]
	i := 0
	r := 8 - (int(col.numValues) % 8)
	bytes := rows.Uint8Array()

	if r <= bytes.Len() {
		// First we attempt to write enough bits to align the number of values
		// in the column buffer to a multiple of 8. After this step the next
		// bit should be written at the zero'th index of a byte of the buffer.
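		// For example, with col.numValues = 13 the remainder r is 3, so the
		// next 3 input values are packed into bits 5..7 of col.bits[1] before
		// the byte-aligned fast path below takes over.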
		if r < 8 {
			var b byte
			for i < r {
				v := bytes.Index(i)
				b |= (v & 1) << uint(i)
				i++
			}
			x := uint(col.numValues) / 8
			y := uint(col.numValues) % 8
			col.bits[x] |= (b << y) | (col.bits[x] & ^(0xFF << y))
			col.numValues += int32(i)
		}

		if n := ((bytes.Len() - i) / 8) * 8; n > 0 {
			// At this stage, we know that we have at least 8 bits to write
			// and the bits will be aligned on the address of a byte in the
			// output buffer. We can work on 8 values per loop iteration,
			// packing them into a single byte and writing it to the output
			// buffer. This effectively reduces by 87.5% the number of memory
			// stores that the program needs to perform to generate the values.
			i += sparse.GatherBits(col.bits[col.numValues/8:], bytes.Slice(i, i+n))
			col.numValues += int32(n)
		}
	}

	for i < bytes.Len() {
		x := uint(col.numValues) / 8
		y := uint(col.numValues) % 8
		b := bytes.Index(i)
		col.bits[x] = ((b & 1) << y) | (col.bits[x] & ^(1 << y))
		col.numValues++
		i++
	}

	col.bits = col.bits[:bitpack.ByteCount(uint(col.numValues))]
}

func (col *booleanColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(col.numValues))
	case i >= int(col.numValues):
		return 0, io.EOF
	default:
		for n < len(values) && i < int(col.numValues) {
			values[n] = col.makeValue(col.valueAt(i))
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type int32ColumnBuffer struct{ int32Page }

func newInt32ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int32ColumnBuffer {
	return &int32ColumnBuffer{
		int32Page: int32Page{
			typ: typ,
			values: make([]int32, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *int32ColumnBuffer) Clone() ColumnBuffer {
	return &int32ColumnBuffer{
		int32Page: int32Page{
			typ: col.typ,
			values: append([]int32{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *int32ColumnBuffer) ColumnIndex() ColumnIndex { return int32ColumnIndex{&col.int32Page} }

func (col *int32ColumnBuffer) OffsetIndex() OffsetIndex { return int32OffsetIndex{&col.int32Page} }

func (col *int32ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *int32ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *int32ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *int32ColumnBuffer) Page() BufferedPage { return &col.int32Page }

func (col *int32ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *int32ColumnBuffer) Cap() int { return cap(col.values) }

func (col *int32ColumnBuffer) Len() int { return len(col.values) }

func (col *int32ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *int32ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

func (col *int32ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 4) != 0 {
		return 0, fmt.Errorf("cannot write INT32 values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToInt32(b)...)
	return len(b), nil
}

func (col *int32ColumnBuffer) WriteInt32s(values []int32) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *int32ColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

func (col *int32ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]int32, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherInt32(col.values[n:], rows.Int32Array())
}

func (col *int32ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type int64ColumnBuffer struct{ int64Page }

func newInt64ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int64ColumnBuffer {
	return &int64ColumnBuffer{
		int64Page: int64Page{
			typ: typ,
			values: make([]int64, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *int64ColumnBuffer) Clone() ColumnBuffer {
	return &int64ColumnBuffer{
		int64Page: int64Page{
			typ: col.typ,
			values: append([]int64{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *int64ColumnBuffer) ColumnIndex() ColumnIndex { return int64ColumnIndex{&col.int64Page} }

func (col *int64ColumnBuffer) OffsetIndex() OffsetIndex { return int64OffsetIndex{&col.int64Page} }

func (col *int64ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *int64ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *int64ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *int64ColumnBuffer) Page() BufferedPage { return &col.int64Page }

func (col *int64ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *int64ColumnBuffer) Cap() int { return cap(col.values) }

func (col *int64ColumnBuffer) Len() int { return len(col.values) }

func (col *int64ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *int64ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

func (col *int64ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 8) != 0 {
		return 0, fmt.Errorf("cannot write INT64 values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToInt64(b)...)
	return len(b), nil
}

func (col *int64ColumnBuffer) WriteInt64s(values []int64) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *int64ColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

func (col *int64ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]int64, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherInt64(col.values[n:], rows.Int64Array())
}

func (col *int64ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type int96ColumnBuffer struct{ int96Page }

func newInt96ColumnBuffer(typ Type, columnIndex int16, numValues int32) *int96ColumnBuffer {
	return &int96ColumnBuffer{
		int96Page: int96Page{
			typ: typ,
			values: make([]deprecated.Int96, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *int96ColumnBuffer) Clone() ColumnBuffer {
	return &int96ColumnBuffer{
		int96Page: int96Page{
			typ: col.typ,
			values: append([]deprecated.Int96{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *int96ColumnBuffer) ColumnIndex() ColumnIndex { return int96ColumnIndex{&col.int96Page} }

func (col *int96ColumnBuffer) OffsetIndex() OffsetIndex { return int96OffsetIndex{&col.int96Page} }

func (col *int96ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *int96ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *int96ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *int96ColumnBuffer) Page() BufferedPage { return &col.int96Page }

func (col *int96ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *int96ColumnBuffer) Cap() int { return cap(col.values) }

func (col *int96ColumnBuffer) Len() int { return len(col.values) }

func (col *int96ColumnBuffer) Less(i, j int) bool { return col.values[i].Less(col.values[j]) }

func (col *int96ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

func (col *int96ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 12) != 0 {
		return 0, fmt.Errorf("cannot write INT96 values from input of size %d", len(b))
	}
	col.values = append(col.values, deprecated.BytesToInt96(b)...)
	return len(b), nil
}

func (col *int96ColumnBuffer) WriteInt96s(values []deprecated.Int96) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *int96ColumnBuffer) WriteValues(values []Value) (int, error) {
	for _, v := range values {
		col.values = append(col.values, v.Int96())
	}
	return len(values), nil
}

func (col *int96ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	for i := 0; i < rows.Len(); i++ {
		p := rows.Index(i)
		col.values = append(col.values, *(*deprecated.Int96)(p))
	}
}

func (col *int96ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type floatColumnBuffer struct{ floatPage }

func newFloatColumnBuffer(typ Type, columnIndex int16, numValues int32) *floatColumnBuffer {
	return &floatColumnBuffer{
		floatPage: floatPage{
			typ: typ,
			values: make([]float32, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *floatColumnBuffer) Clone() ColumnBuffer {
	return &floatColumnBuffer{
		floatPage: floatPage{
			typ: col.typ,
			values: append([]float32{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *floatColumnBuffer) ColumnIndex() ColumnIndex { return floatColumnIndex{&col.floatPage} }

func (col *floatColumnBuffer) OffsetIndex() OffsetIndex { return floatOffsetIndex{&col.floatPage} }

func (col *floatColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *floatColumnBuffer) Dictionary() Dictionary { return nil }

func (col *floatColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *floatColumnBuffer) Page() BufferedPage { return &col.floatPage }

func (col *floatColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *floatColumnBuffer) Cap() int { return cap(col.values) }

func (col *floatColumnBuffer) Len() int { return len(col.values) }

func (col *floatColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *floatColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

func (col *floatColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 4) != 0 {
		return 0, fmt.Errorf("cannot write FLOAT values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToFloat32(b)...)
	return len(b), nil
}

func (col *floatColumnBuffer) WriteFloats(values []float32) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *floatColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

func (col *floatColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]float32, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherFloat32(col.values[n:], rows.Float32Array())
}

func (col *floatColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type doubleColumnBuffer struct{ doublePage }

func newDoubleColumnBuffer(typ Type, columnIndex int16, numValues int32) *doubleColumnBuffer {
	return &doubleColumnBuffer{
		doublePage: doublePage{
			typ: typ,
			values: make([]float64, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *doubleColumnBuffer) Clone() ColumnBuffer {
	return &doubleColumnBuffer{
		doublePage: doublePage{
			typ: col.typ,
			values: append([]float64{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *doubleColumnBuffer) ColumnIndex() ColumnIndex { return doubleColumnIndex{&col.doublePage} }

func (col *doubleColumnBuffer) OffsetIndex() OffsetIndex { return doubleOffsetIndex{&col.doublePage} }

func (col *doubleColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *doubleColumnBuffer) Dictionary() Dictionary { return nil }

func (col *doubleColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *doubleColumnBuffer) Page() BufferedPage { return &col.doublePage }

func (col *doubleColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *doubleColumnBuffer) Cap() int { return cap(col.values) }

func (col *doubleColumnBuffer) Len() int { return len(col.values) }

func (col *doubleColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *doubleColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

func (col *doubleColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 8) != 0 {
		return 0, fmt.Errorf("cannot write DOUBLE values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToFloat64(b)...)
	return len(b), nil
}

func (col *doubleColumnBuffer) WriteDoubles(values []float64) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *doubleColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

func (col *doubleColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]float64, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherFloat64(col.values[n:], rows.Float64Array())
}

func (col *doubleColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type byteArrayColumnBuffer struct {
	byteArrayPage
	offsets []uint32
}

func newByteArrayColumnBuffer(typ Type, columnIndex int16, numValues int32) *byteArrayColumnBuffer {
	return &byteArrayColumnBuffer{
		byteArrayPage: byteArrayPage{
			typ: typ,
			values: make([]byte, 0, typ.EstimateSize(int(numValues))),
			columnIndex: ^columnIndex,
		},
		offsets: make([]uint32, 0, numValues),
	}
}

func (col *byteArrayColumnBuffer) cloneOffsets() []uint32 {
	offsets := make([]uint32, len(col.offsets))
	copy(offsets, col.offsets)
	return offsets
}

func (col *byteArrayColumnBuffer) Clone() ColumnBuffer {
	return &byteArrayColumnBuffer{
		byteArrayPage: byteArrayPage{
			typ: col.typ,
			values: col.cloneValues(),
			numValues: col.numValues,
			columnIndex: col.columnIndex,
		},
		offsets: col.cloneOffsets(),
	}
}

func (col *byteArrayColumnBuffer) ColumnIndex() ColumnIndex {
	return byteArrayColumnIndex{&col.byteArrayPage}
}

func (col *byteArrayColumnBuffer) OffsetIndex() OffsetIndex {
	return byteArrayOffsetIndex{&col.byteArrayPage}
}

func (col *byteArrayColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *byteArrayColumnBuffer) Dictionary() Dictionary { return nil }

func (col *byteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *byteArrayColumnBuffer) Page() BufferedPage {
	if len(col.offsets) > 0 && orderOfUint32(col.offsets) < 1 { // unordered?
		values := make([]byte, 0, len(col.values)) // TODO: pool this buffer?
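		// The loop below rewrites col.values in the order given by
		// col.offsets (the sorted order), then rebuilds col.offsets by
		// scanning the PLAIN-encoded entries so that the offsets are
		// ascending again and the page can be read sequentially.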
		for _, offset := range col.offsets {
			values = plain.AppendByteArray(values, col.valueAt(offset))
		}

		col.values = values
		col.offsets = col.offsets[:0]

		for i := 0; i < len(col.values); {
			n := plain.ByteArrayLength(col.values[i:])
			col.offsets = append(col.offsets, uint32(i))
			i += plain.ByteArrayLengthSize
			i += n
		}
	}
	return &col.byteArrayPage
}

func (col *byteArrayColumnBuffer) Reset() {
	col.values = col.values[:0]
	col.offsets = col.offsets[:0]
	col.numValues = 0
}

func (col *byteArrayColumnBuffer) Cap() int { return cap(col.offsets) }

func (col *byteArrayColumnBuffer) Len() int { return len(col.offsets) }

func (col *byteArrayColumnBuffer) Less(i, j int) bool {
	a := col.valueAt(col.offsets[i])
	b := col.valueAt(col.offsets[j])
	return bytes.Compare(a, b) < 0
}

func (col *byteArrayColumnBuffer) Swap(i, j int) {
	col.offsets[i], col.offsets[j] = col.offsets[j], col.offsets[i]
}

func (col *byteArrayColumnBuffer) Write(b []byte) (int, error) {
	_, n, err := col.writeByteArrays(b)
	return n, err
}

func (col *byteArrayColumnBuffer) WriteByteArrays(values []byte) (int, error) {
	n, _, err := col.writeByteArrays(values)
	return n, err
}

func (col *byteArrayColumnBuffer) writeByteArrays(values []byte) (count, bytes int, err error) {
	baseCount, baseBytes := len(col.offsets), len(col.values)

	err = plain.RangeByteArray(values, func(value []byte) error {
		col.append(unsafecast.BytesToString(value))
		return nil
	})

	return len(col.offsets) - baseCount, len(col.values) - baseBytes, err
}

func (col *byteArrayColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.ptr)), columnLevels{})
	return len(values), nil
}

func (col *byteArrayColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	for i := 0; i < rows.Len(); i++ {
		p := rows.Index(i)
		col.append(*(*string)(p))
	}
}

func (col *byteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.offsets)))
	case i >= len(col.offsets):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.offsets) {
			values[n] = col.makeValueBytes(col.valueAt(col.offsets[i]))
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

func (col *byteArrayColumnBuffer) append(value string) {
	col.offsets = append(col.offsets, uint32(len(col.values)))
	col.values = plain.AppendByteArrayString(col.values, value)
	col.numValues++
}

type fixedLenByteArrayColumnBuffer struct {
	fixedLenByteArrayPage
	tmp []byte
}

func newFixedLenByteArrayColumnBuffer(typ Type, columnIndex int16, numValues int32) *fixedLenByteArrayColumnBuffer {
	size := typ.Length()
	return &fixedLenByteArrayColumnBuffer{
		fixedLenByteArrayPage: fixedLenByteArrayPage{
			typ: typ,
			size: size,
			data: make([]byte, 0, typ.EstimateSize(int(numValues))),
			columnIndex: ^columnIndex,
		},
		tmp: make([]byte, size),
	}
}

func (col *fixedLenByteArrayColumnBuffer) Clone() ColumnBuffer {
	return &fixedLenByteArrayColumnBuffer{
		fixedLenByteArrayPage: fixedLenByteArrayPage{
			typ: col.typ,
			size: col.size,
			data: append([]byte{}, col.data...),
			columnIndex: col.columnIndex,
		},
		tmp: make([]byte, col.size),
	}
}

func (col *fixedLenByteArrayColumnBuffer) ColumnIndex() ColumnIndex {
	return fixedLenByteArrayColumnIndex{&col.fixedLenByteArrayPage}
}

func (col *fixedLenByteArrayColumnBuffer) OffsetIndex() OffsetIndex {
	return fixedLenByteArrayOffsetIndex{&col.fixedLenByteArrayPage}
}

func (col *fixedLenByteArrayColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *fixedLenByteArrayColumnBuffer) Dictionary() Dictionary { return nil }

func (col *fixedLenByteArrayColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *fixedLenByteArrayColumnBuffer) Page() BufferedPage { return &col.fixedLenByteArrayPage }

func (col *fixedLenByteArrayColumnBuffer) Reset() { col.data = col.data[:0] }

func (col *fixedLenByteArrayColumnBuffer) Cap() int { return cap(col.data) / col.size }

func (col *fixedLenByteArrayColumnBuffer) Len() int { return len(col.data) / col.size }

func (col *fixedLenByteArrayColumnBuffer) Less(i, j int) bool {
	return bytes.Compare(col.index(i), col.index(j)) < 0
}

func (col *fixedLenByteArrayColumnBuffer) Swap(i, j int) {
	t, u, v := col.tmp[:col.size], col.index(i), col.index(j)
	copy(t, u)
	copy(u, v)
	copy(v, t)
}

func (col *fixedLenByteArrayColumnBuffer) index(i int) []byte {
	j := (i + 0) * col.size
	k := (i + 1) * col.size
	return col.data[j:k:k]
}

func (col *fixedLenByteArrayColumnBuffer) Write(b []byte) (int, error) {
	n, err := col.WriteFixedLenByteArrays(b)
	return n * col.size, err
}

func (col *fixedLenByteArrayColumnBuffer) WriteFixedLenByteArrays(values []byte) (int, error) {
	d, m := len(values)/col.size, len(values)%col.size
	if m != 0 {
		return 0, fmt.Errorf("cannot write FIXED_LEN_BYTE_ARRAY values of size %d from input of size %d", col.size, len(values))
	}
	col.data = append(col.data, values...)
	return d, nil
}

func (col *fixedLenByteArrayColumnBuffer) WriteValues(values []Value) (int, error) {
	for _, v := range values {
		col.data = append(col.data, v.ByteArray()...)
	}
	return len(values), nil
}

func (col *fixedLenByteArrayColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	n := col.size * rows.Len()
	i := len(col.data)
	j := len(col.data) + n

	if cap(col.data) < j {
		col.data = append(make([]byte, 0, max(i+n, 2*cap(col.data))), col.data...)
	}

	col.data = col.data[:j]
	newData := col.data[i:]

	for i := 0; i < rows.Len(); i++ {
		p := rows.Index(i)
		copy(newData[i*col.size:], unsafe.Slice((*byte)(p), col.size))
	}
}

func (col *fixedLenByteArrayColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset) * col.size
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.data)/col.size))
	case i >= len(col.data):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.data) {
			values[n] = col.makeValueBytes(col.data[i : i+col.size])
			n++
			i += col.size
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type uint32ColumnBuffer struct{ uint32Page }

func newUint32ColumnBuffer(typ Type, columnIndex int16, numValues int32) *uint32ColumnBuffer {
	return &uint32ColumnBuffer{
		uint32Page: uint32Page{
			typ: typ,
			values: make([]uint32, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *uint32ColumnBuffer) Clone() ColumnBuffer {
	return &uint32ColumnBuffer{
		uint32Page: uint32Page{
			typ: col.typ,
			values: append([]uint32{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *uint32ColumnBuffer) ColumnIndex() ColumnIndex { return uint32ColumnIndex{&col.uint32Page} }

func (col *uint32ColumnBuffer) OffsetIndex() OffsetIndex { return uint32OffsetIndex{&col.uint32Page} }

func (col *uint32ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *uint32ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *uint32ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *uint32ColumnBuffer) Page() BufferedPage { return &col.uint32Page }

func (col *uint32ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *uint32ColumnBuffer) Cap() int { return cap(col.values) }

func (col *uint32ColumnBuffer) Len() int { return len(col.values) }

func (col *uint32ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *uint32ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

func (col *uint32ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 4) != 0 {
		return 0, fmt.Errorf("cannot write INT32 values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToUint32(b)...)
	return len(b), nil
}

func (col *uint32ColumnBuffer) WriteUint32s(values []uint32) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *uint32ColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

func (col *uint32ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]uint32, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherUint32(col.values[n:], rows.Uint32Array())
}

func (col *uint32ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type uint64ColumnBuffer struct{ uint64Page }

func newUint64ColumnBuffer(typ Type, columnIndex int16, numValues int32) *uint64ColumnBuffer {
	return &uint64ColumnBuffer{
		uint64Page: uint64Page{
			typ: typ,
			values: make([]uint64, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *uint64ColumnBuffer) Clone() ColumnBuffer {
	return &uint64ColumnBuffer{
		uint64Page: uint64Page{
			typ: col.typ,
			values: append([]uint64{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *uint64ColumnBuffer) ColumnIndex() ColumnIndex { return uint64ColumnIndex{&col.uint64Page} }

func (col *uint64ColumnBuffer) OffsetIndex() OffsetIndex { return uint64OffsetIndex{&col.uint64Page} }

func (col *uint64ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *uint64ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *uint64ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *uint64ColumnBuffer) Page() BufferedPage { return &col.uint64Page }

func (col *uint64ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *uint64ColumnBuffer) Cap() int { return cap(col.values) }

func (col *uint64ColumnBuffer) Len() int { return len(col.values) }

func (col *uint64ColumnBuffer) Less(i, j int) bool { return col.values[i] < col.values[j] }

func (col *uint64ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

func (col *uint64ColumnBuffer) Write(b []byte) (int, error) {
	if (len(b) % 8) != 0 {
		return 0, fmt.Errorf("cannot write INT64 values from input of size %d", len(b))
	}
	col.values = append(col.values, unsafecast.BytesToUint64(b)...)
	return len(b), nil
}

func (col *uint64ColumnBuffer) WriteUint64s(values []uint64) (int, error) {
	col.values = append(col.values, values...)
	return len(values), nil
}

func (col *uint64ColumnBuffer) WriteValues(values []Value) (int, error) {
	var model Value
	col.writeValues(makeArrayValue(values, unsafe.Offsetof(model.u64)), columnLevels{})
	return len(values), nil
}

func (col *uint64ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([]uint64, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherUint64(col.values[n:], rows.Uint64Array())
}

func (col *uint64ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

type be128ColumnBuffer struct{ be128Page }

func newBE128ColumnBuffer(typ Type, columnIndex int16, numValues int32) *be128ColumnBuffer {
	return &be128ColumnBuffer{
		be128Page: be128Page{
			typ: typ,
			values: make([][16]byte, 0, numValues),
			columnIndex: ^columnIndex,
		},
	}
}

func (col *be128ColumnBuffer) Clone() ColumnBuffer {
	return &be128ColumnBuffer{
		be128Page: be128Page{
			typ: col.typ,
			values: append([][16]byte{}, col.values...),
			columnIndex: col.columnIndex,
		},
	}
}

func (col *be128ColumnBuffer) ColumnIndex() ColumnIndex {
	return be128ColumnIndex{&col.be128Page}
}

func (col *be128ColumnBuffer) OffsetIndex() OffsetIndex {
	return be128OffsetIndex{&col.be128Page}
}

func (col *be128ColumnBuffer) BloomFilter() BloomFilter { return nil }

func (col *be128ColumnBuffer) Dictionary() Dictionary { return nil }

func (col *be128ColumnBuffer) Pages() Pages { return onePage(col.Page()) }

func (col *be128ColumnBuffer) Page() BufferedPage { return &col.be128Page }

func (col *be128ColumnBuffer) Reset() { col.values = col.values[:0] }

func (col *be128ColumnBuffer) Cap() int { return cap(col.values) }

func (col *be128ColumnBuffer) Len() int { return len(col.values) }

func (col *be128ColumnBuffer) Less(i, j int) bool {
	return lessBE128(&col.values[i], &col.values[j])
}

func (col *be128ColumnBuffer) Swap(i, j int) {
	col.values[i], col.values[j] = col.values[j], col.values[i]
}

func (col *be128ColumnBuffer) WriteValues(values []Value) (int, error) {
	if n := len(col.values) + len(values); n > cap(col.values) {
		col.values = append(make([][16]byte, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+len(values)]
	newValues := col.values[n:]
	for i, v := range values {
		copy(newValues[i][:], v.ByteArray())
	}
	return len(values), nil
}

func (col *be128ColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
	if n := len(col.values) + rows.Len(); n > cap(col.values) {
		col.values = append(make([][16]byte, 0, max(n, 2*cap(col.values))), col.values...)
	}
	n := len(col.values)
	col.values = col.values[:n+rows.Len()]
	sparse.GatherUint128(col.values[n:], rows.Uint128Array())
}

func (col *be128ColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
	i := int(offset)
	switch {
	case i < 0:
		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
	case i >= len(col.values):
		return 0, io.EOF
	default:
		for n < len(values) && i < len(col.values) {
			values[n] = col.makeValue(&col.values[i])
			n++
			i++
		}
		if n < len(values) {
			err = io.EOF
		}
		return n, err
	}
}

var (
	_ sort.Interface = (ColumnBuffer)(nil)
	_ io.Writer = (*byteArrayColumnBuffer)(nil)
	_ io.Writer = (*fixedLenByteArrayColumnBuffer)(nil)
)
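// The io.Writer assertions above rely on the inputs being PLAIN-encoded: for a
// BYTE_ARRAY column, for example, each value passed to Write is expected to be
// a 4-byte little-endian length followed by the value bytes, which is the
// layout plain.RangeByteArray iterates over in writeByteArrays.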