github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/column_index.go (about) 1 package parquet 2 3 import ( 4 "github.com/parquet-go/parquet-go/deprecated" 5 "github.com/parquet-go/parquet-go/encoding/plain" 6 "github.com/parquet-go/parquet-go/format" 7 "github.com/parquet-go/parquet-go/internal/unsafecast" 8 ) 9 10 type ColumnIndex interface { 11 // NumPages returns the number of paged in the column index. 12 NumPages() int 13 14 // Returns the number of null values in the page at the given index. 15 NullCount(int) int64 16 17 // Tells whether the page at the given index contains null values only. 18 NullPage(int) bool 19 20 // PageIndex return min/max bounds for the page at the given index in the 21 // column. 22 MinValue(int) Value 23 MaxValue(int) Value 24 25 // IsAscending returns true if the column index min/max values are sorted 26 // in ascending order (based on the ordering rules of the column's logical 27 // type). 28 IsAscending() bool 29 30 // IsDescending returns true if the column index min/max values are sorted 31 // in descending order (based on the ordering rules of the column's logical 32 // type). 33 IsDescending() bool 34 } 35 36 // NewColumnIndex constructs a ColumnIndex instance from the given parquet 37 // format column index. The kind argument configures the type of values 38 func NewColumnIndex(kind Kind, index *format.ColumnIndex) ColumnIndex { 39 return &formatColumnIndex{ 40 kind: kind, 41 index: index, 42 } 43 } 44 45 type formatColumnIndex struct { 46 kind Kind 47 index *format.ColumnIndex 48 } 49 50 func (f *formatColumnIndex) NumPages() int { 51 return len(f.index.MinValues) 52 } 53 54 func (f *formatColumnIndex) NullCount(i int) int64 { 55 if len(f.index.NullCounts) > 0 { 56 return f.index.NullCounts[i] 57 } 58 return 0 59 } 60 61 func (f *formatColumnIndex) NullPage(i int) bool { 62 return len(f.index.NullPages) > 0 && f.index.NullPages[i] 63 } 64 65 func (f *formatColumnIndex) MinValue(i int) Value { 66 if f.NullPage(i) { 67 return Value{} 68 } 69 return f.kind.Value(f.index.MinValues[i]) 70 } 71 72 func (f *formatColumnIndex) MaxValue(i int) Value { 73 if f.NullPage(i) { 74 return Value{} 75 } 76 return f.kind.Value(f.index.MaxValues[i]) 77 } 78 79 func (f *formatColumnIndex) IsAscending() bool { 80 return f.index.BoundaryOrder == format.Ascending 81 } 82 83 func (f *formatColumnIndex) IsDescending() bool { 84 return f.index.BoundaryOrder == format.Descending 85 } 86 87 type fileColumnIndex struct{ chunk *fileColumnChunk } 88 89 func (i fileColumnIndex) NumPages() int { 90 return len(i.chunk.columnIndex.NullPages) 91 } 92 93 func (i fileColumnIndex) NullCount(j int) int64 { 94 if len(i.chunk.columnIndex.NullCounts) > 0 { 95 return i.chunk.columnIndex.NullCounts[j] 96 } 97 return 0 98 } 99 100 func (i fileColumnIndex) NullPage(j int) bool { 101 return len(i.chunk.columnIndex.NullPages) > 0 && i.chunk.columnIndex.NullPages[j] 102 } 103 104 func (i fileColumnIndex) MinValue(j int) Value { 105 if i.NullPage(j) { 106 return Value{} 107 } 108 return i.makeValue(i.chunk.columnIndex.MinValues[j]) 109 } 110 111 func (i fileColumnIndex) MaxValue(j int) Value { 112 if i.NullPage(j) { 113 return Value{} 114 } 115 return i.makeValue(i.chunk.columnIndex.MaxValues[j]) 116 } 117 118 func (i fileColumnIndex) IsAscending() bool { 119 return i.chunk.columnIndex.BoundaryOrder == format.Ascending 120 } 121 122 func (i fileColumnIndex) IsDescending() bool { 123 return i.chunk.columnIndex.BoundaryOrder == format.Descending 124 } 125 126 func (i *fileColumnIndex) makeValue(b []byte) Value { 127 return i.chunk.column.typ.Kind().Value(b) 128 } 129 130 type emptyColumnIndex struct{} 131 132 func (emptyColumnIndex) NumPages() int { return 0 } 133 func (emptyColumnIndex) NullCount(int) int64 { return 0 } 134 func (emptyColumnIndex) NullPage(int) bool { return false } 135 func (emptyColumnIndex) MinValue(int) Value { return Value{} } 136 func (emptyColumnIndex) MaxValue(int) Value { return Value{} } 137 func (emptyColumnIndex) IsAscending() bool { return false } 138 func (emptyColumnIndex) IsDescending() bool { return false } 139 140 type booleanColumnIndex struct{ page *booleanPage } 141 142 func (i booleanColumnIndex) NumPages() int { return 1 } 143 func (i booleanColumnIndex) NullCount(int) int64 { return 0 } 144 func (i booleanColumnIndex) NullPage(int) bool { return false } 145 func (i booleanColumnIndex) MinValue(int) Value { return makeValueBoolean(i.page.min()) } 146 func (i booleanColumnIndex) MaxValue(int) Value { return makeValueBoolean(i.page.max()) } 147 func (i booleanColumnIndex) IsAscending() bool { return false } 148 func (i booleanColumnIndex) IsDescending() bool { return false } 149 150 type int32ColumnIndex struct{ page *int32Page } 151 152 func (i int32ColumnIndex) NumPages() int { return 1 } 153 func (i int32ColumnIndex) NullCount(int) int64 { return 0 } 154 func (i int32ColumnIndex) NullPage(int) bool { return false } 155 func (i int32ColumnIndex) MinValue(int) Value { return makeValueInt32(i.page.min()) } 156 func (i int32ColumnIndex) MaxValue(int) Value { return makeValueInt32(i.page.max()) } 157 func (i int32ColumnIndex) IsAscending() bool { return false } 158 func (i int32ColumnIndex) IsDescending() bool { return false } 159 160 type int64ColumnIndex struct{ page *int64Page } 161 162 func (i int64ColumnIndex) NumPages() int { return 1 } 163 func (i int64ColumnIndex) NullCount(int) int64 { return 0 } 164 func (i int64ColumnIndex) NullPage(int) bool { return false } 165 func (i int64ColumnIndex) MinValue(int) Value { return makeValueInt64(i.page.min()) } 166 func (i int64ColumnIndex) MaxValue(int) Value { return makeValueInt64(i.page.max()) } 167 func (i int64ColumnIndex) IsAscending() bool { return false } 168 func (i int64ColumnIndex) IsDescending() bool { return false } 169 170 type int96ColumnIndex struct{ page *int96Page } 171 172 func (i int96ColumnIndex) NumPages() int { return 1 } 173 func (i int96ColumnIndex) NullCount(int) int64 { return 0 } 174 func (i int96ColumnIndex) NullPage(int) bool { return false } 175 func (i int96ColumnIndex) MinValue(int) Value { return makeValueInt96(i.page.min()) } 176 func (i int96ColumnIndex) MaxValue(int) Value { return makeValueInt96(i.page.max()) } 177 func (i int96ColumnIndex) IsAscending() bool { return false } 178 func (i int96ColumnIndex) IsDescending() bool { return false } 179 180 type floatColumnIndex struct{ page *floatPage } 181 182 func (i floatColumnIndex) NumPages() int { return 1 } 183 func (i floatColumnIndex) NullCount(int) int64 { return 0 } 184 func (i floatColumnIndex) NullPage(int) bool { return false } 185 func (i floatColumnIndex) MinValue(int) Value { return makeValueFloat(i.page.min()) } 186 func (i floatColumnIndex) MaxValue(int) Value { return makeValueFloat(i.page.max()) } 187 func (i floatColumnIndex) IsAscending() bool { return false } 188 func (i floatColumnIndex) IsDescending() bool { return false } 189 190 type doubleColumnIndex struct{ page *doublePage } 191 192 func (i doubleColumnIndex) NumPages() int { return 1 } 193 func (i doubleColumnIndex) NullCount(int) int64 { return 0 } 194 func (i doubleColumnIndex) NullPage(int) bool { return false } 195 func (i doubleColumnIndex) MinValue(int) Value { return makeValueDouble(i.page.min()) } 196 func (i doubleColumnIndex) MaxValue(int) Value { return makeValueDouble(i.page.max()) } 197 func (i doubleColumnIndex) IsAscending() bool { return false } 198 func (i doubleColumnIndex) IsDescending() bool { return false } 199 200 type byteArrayColumnIndex struct{ page *byteArrayPage } 201 202 func (i byteArrayColumnIndex) NumPages() int { return 1 } 203 func (i byteArrayColumnIndex) NullCount(int) int64 { return 0 } 204 func (i byteArrayColumnIndex) NullPage(int) bool { return false } 205 func (i byteArrayColumnIndex) MinValue(int) Value { return makeValueBytes(ByteArray, i.page.min()) } 206 func (i byteArrayColumnIndex) MaxValue(int) Value { return makeValueBytes(ByteArray, i.page.max()) } 207 func (i byteArrayColumnIndex) IsAscending() bool { return false } 208 func (i byteArrayColumnIndex) IsDescending() bool { return false } 209 210 type fixedLenByteArrayColumnIndex struct{ page *fixedLenByteArrayPage } 211 212 func (i fixedLenByteArrayColumnIndex) NumPages() int { return 1 } 213 func (i fixedLenByteArrayColumnIndex) NullCount(int) int64 { return 0 } 214 func (i fixedLenByteArrayColumnIndex) NullPage(int) bool { return false } 215 func (i fixedLenByteArrayColumnIndex) MinValue(int) Value { 216 return makeValueBytes(FixedLenByteArray, i.page.min()) 217 } 218 func (i fixedLenByteArrayColumnIndex) MaxValue(int) Value { 219 return makeValueBytes(FixedLenByteArray, i.page.max()) 220 } 221 func (i fixedLenByteArrayColumnIndex) IsAscending() bool { return false } 222 func (i fixedLenByteArrayColumnIndex) IsDescending() bool { return false } 223 224 type uint32ColumnIndex struct{ page *uint32Page } 225 226 func (i uint32ColumnIndex) NumPages() int { return 1 } 227 func (i uint32ColumnIndex) NullCount(int) int64 { return 0 } 228 func (i uint32ColumnIndex) NullPage(int) bool { return false } 229 func (i uint32ColumnIndex) MinValue(int) Value { return makeValueUint32(i.page.min()) } 230 func (i uint32ColumnIndex) MaxValue(int) Value { return makeValueUint32(i.page.max()) } 231 func (i uint32ColumnIndex) IsAscending() bool { return false } 232 func (i uint32ColumnIndex) IsDescending() bool { return false } 233 234 type uint64ColumnIndex struct{ page *uint64Page } 235 236 func (i uint64ColumnIndex) NumPages() int { return 1 } 237 func (i uint64ColumnIndex) NullCount(int) int64 { return 0 } 238 func (i uint64ColumnIndex) NullPage(int) bool { return false } 239 func (i uint64ColumnIndex) MinValue(int) Value { return makeValueUint64(i.page.min()) } 240 func (i uint64ColumnIndex) MaxValue(int) Value { return makeValueUint64(i.page.max()) } 241 func (i uint64ColumnIndex) IsAscending() bool { return false } 242 func (i uint64ColumnIndex) IsDescending() bool { return false } 243 244 type be128ColumnIndex struct{ page *be128Page } 245 246 func (i be128ColumnIndex) NumPages() int { return 1 } 247 func (i be128ColumnIndex) NullCount(int) int64 { return 0 } 248 func (i be128ColumnIndex) NullPage(int) bool { return false } 249 func (i be128ColumnIndex) MinValue(int) Value { return makeValueBytes(FixedLenByteArray, i.page.min()) } 250 func (i be128ColumnIndex) MaxValue(int) Value { return makeValueBytes(FixedLenByteArray, i.page.max()) } 251 func (i be128ColumnIndex) IsAscending() bool { return false } 252 func (i be128ColumnIndex) IsDescending() bool { return false } 253 254 // The ColumnIndexer interface is implemented by types that support generating 255 // parquet column indexes. 256 // 257 // The package does not export any types that implement this interface, programs 258 // must call NewColumnIndexer on a Type instance to construct column indexers. 259 type ColumnIndexer interface { 260 // Resets the column indexer state. 261 Reset() 262 263 // Add a page to the column indexer. 264 IndexPage(numValues, numNulls int64, min, max Value) 265 266 // Generates a format.ColumnIndex value from the current state of the 267 // column indexer. 268 // 269 // The returned value may reference internal buffers, in which case the 270 // values remain valid until the next call to IndexPage or Reset on the 271 // column indexer. 272 ColumnIndex() format.ColumnIndex 273 } 274 275 type baseColumnIndexer struct { 276 nullPages []bool 277 nullCounts []int64 278 } 279 280 func (i *baseColumnIndexer) reset() { 281 i.nullPages = i.nullPages[:0] 282 i.nullCounts = i.nullCounts[:0] 283 } 284 285 func (i *baseColumnIndexer) observe(numValues, numNulls int64) { 286 i.nullPages = append(i.nullPages, numValues == numNulls) 287 i.nullCounts = append(i.nullCounts, numNulls) 288 } 289 290 func (i *baseColumnIndexer) columnIndex(minValues, maxValues [][]byte, minOrder, maxOrder int) format.ColumnIndex { 291 nullPages := make([]bool, len(i.nullPages)) 292 copy(nullPages, i.nullPages) 293 nullCounts := make([]int64, len(i.nullCounts)) 294 copy(nullCounts, i.nullCounts) 295 return format.ColumnIndex{ 296 NullPages: nullPages, 297 NullCounts: nullCounts, 298 MinValues: minValues, 299 MaxValues: maxValues, 300 BoundaryOrder: boundaryOrderOf(minOrder, maxOrder), 301 } 302 } 303 304 type booleanColumnIndexer struct { 305 baseColumnIndexer 306 minValues []bool 307 maxValues []bool 308 } 309 310 func newBooleanColumnIndexer() *booleanColumnIndexer { 311 return new(booleanColumnIndexer) 312 } 313 314 func (i *booleanColumnIndexer) Reset() { 315 i.reset() 316 i.minValues = i.minValues[:0] 317 i.maxValues = i.maxValues[:0] 318 } 319 320 func (i *booleanColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 321 i.observe(numValues, numNulls) 322 i.minValues = append(i.minValues, min.boolean()) 323 i.maxValues = append(i.maxValues, max.boolean()) 324 } 325 326 func (i *booleanColumnIndexer) ColumnIndex() format.ColumnIndex { 327 return i.columnIndex( 328 splitFixedLenByteArrays(unsafecast.BoolToBytes(i.minValues), 1), 329 splitFixedLenByteArrays(unsafecast.BoolToBytes(i.maxValues), 1), 330 orderOfBool(i.minValues), 331 orderOfBool(i.maxValues), 332 ) 333 } 334 335 type int32ColumnIndexer struct { 336 baseColumnIndexer 337 minValues []int32 338 maxValues []int32 339 } 340 341 func newInt32ColumnIndexer() *int32ColumnIndexer { 342 return new(int32ColumnIndexer) 343 } 344 345 func (i *int32ColumnIndexer) Reset() { 346 i.reset() 347 i.minValues = i.minValues[:0] 348 i.maxValues = i.maxValues[:0] 349 } 350 351 func (i *int32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 352 i.observe(numValues, numNulls) 353 i.minValues = append(i.minValues, min.int32()) 354 i.maxValues = append(i.maxValues, max.int32()) 355 } 356 357 func (i *int32ColumnIndexer) ColumnIndex() format.ColumnIndex { 358 return i.columnIndex( 359 splitFixedLenByteArrays(unsafecast.Int32ToBytes(i.minValues), 4), 360 splitFixedLenByteArrays(unsafecast.Int32ToBytes(i.maxValues), 4), 361 orderOfInt32(i.minValues), 362 orderOfInt32(i.maxValues), 363 ) 364 } 365 366 type int64ColumnIndexer struct { 367 baseColumnIndexer 368 minValues []int64 369 maxValues []int64 370 } 371 372 func newInt64ColumnIndexer() *int64ColumnIndexer { 373 return new(int64ColumnIndexer) 374 } 375 376 func (i *int64ColumnIndexer) Reset() { 377 i.reset() 378 i.minValues = i.minValues[:0] 379 i.maxValues = i.maxValues[:0] 380 } 381 382 func (i *int64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 383 i.observe(numValues, numNulls) 384 i.minValues = append(i.minValues, min.int64()) 385 i.maxValues = append(i.maxValues, max.int64()) 386 } 387 388 func (i *int64ColumnIndexer) ColumnIndex() format.ColumnIndex { 389 return i.columnIndex( 390 splitFixedLenByteArrays(unsafecast.Int64ToBytes(i.minValues), 8), 391 splitFixedLenByteArrays(unsafecast.Int64ToBytes(i.maxValues), 8), 392 orderOfInt64(i.minValues), 393 orderOfInt64(i.maxValues), 394 ) 395 } 396 397 type int96ColumnIndexer struct { 398 baseColumnIndexer 399 minValues []deprecated.Int96 400 maxValues []deprecated.Int96 401 } 402 403 func newInt96ColumnIndexer() *int96ColumnIndexer { 404 return new(int96ColumnIndexer) 405 } 406 407 func (i *int96ColumnIndexer) Reset() { 408 i.reset() 409 i.minValues = i.minValues[:0] 410 i.maxValues = i.maxValues[:0] 411 } 412 413 func (i *int96ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 414 i.observe(numValues, numNulls) 415 i.minValues = append(i.minValues, min.Int96()) 416 i.maxValues = append(i.maxValues, max.Int96()) 417 } 418 419 func (i *int96ColumnIndexer) ColumnIndex() format.ColumnIndex { 420 return i.columnIndex( 421 splitFixedLenByteArrays(deprecated.Int96ToBytes(i.minValues), 12), 422 splitFixedLenByteArrays(deprecated.Int96ToBytes(i.maxValues), 12), 423 deprecated.OrderOfInt96(i.minValues), 424 deprecated.OrderOfInt96(i.maxValues), 425 ) 426 } 427 428 type floatColumnIndexer struct { 429 baseColumnIndexer 430 minValues []float32 431 maxValues []float32 432 } 433 434 func newFloatColumnIndexer() *floatColumnIndexer { 435 return new(floatColumnIndexer) 436 } 437 438 func (i *floatColumnIndexer) Reset() { 439 i.reset() 440 i.minValues = i.minValues[:0] 441 i.maxValues = i.maxValues[:0] 442 } 443 444 func (i *floatColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 445 i.observe(numValues, numNulls) 446 i.minValues = append(i.minValues, min.float()) 447 i.maxValues = append(i.maxValues, max.float()) 448 } 449 450 func (i *floatColumnIndexer) ColumnIndex() format.ColumnIndex { 451 return i.columnIndex( 452 splitFixedLenByteArrays(unsafecast.Float32ToBytes(i.minValues), 4), 453 splitFixedLenByteArrays(unsafecast.Float32ToBytes(i.maxValues), 4), 454 orderOfFloat32(i.minValues), 455 orderOfFloat32(i.maxValues), 456 ) 457 } 458 459 type doubleColumnIndexer struct { 460 baseColumnIndexer 461 minValues []float64 462 maxValues []float64 463 } 464 465 func newDoubleColumnIndexer() *doubleColumnIndexer { 466 return new(doubleColumnIndexer) 467 } 468 469 func (i *doubleColumnIndexer) Reset() { 470 i.reset() 471 i.minValues = i.minValues[:0] 472 i.maxValues = i.maxValues[:0] 473 } 474 475 func (i *doubleColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 476 i.observe(numValues, numNulls) 477 i.minValues = append(i.minValues, min.double()) 478 i.maxValues = append(i.maxValues, max.double()) 479 } 480 481 func (i *doubleColumnIndexer) ColumnIndex() format.ColumnIndex { 482 return i.columnIndex( 483 splitFixedLenByteArrays(unsafecast.Float64ToBytes(i.minValues), 8), 484 splitFixedLenByteArrays(unsafecast.Float64ToBytes(i.maxValues), 8), 485 orderOfFloat64(i.minValues), 486 orderOfFloat64(i.maxValues), 487 ) 488 } 489 490 type byteArrayColumnIndexer struct { 491 baseColumnIndexer 492 sizeLimit int 493 minValues []byte 494 maxValues []byte 495 } 496 497 func newByteArrayColumnIndexer(sizeLimit int) *byteArrayColumnIndexer { 498 return &byteArrayColumnIndexer{sizeLimit: sizeLimit} 499 } 500 501 func (i *byteArrayColumnIndexer) Reset() { 502 i.reset() 503 i.minValues = i.minValues[:0] 504 i.maxValues = i.maxValues[:0] 505 } 506 507 func (i *byteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 508 i.observe(numValues, numNulls) 509 i.minValues = plain.AppendByteArray(i.minValues, min.byteArray()) 510 i.maxValues = plain.AppendByteArray(i.maxValues, max.byteArray()) 511 } 512 513 func (i *byteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { 514 minValues := splitByteArrays(i.minValues) 515 maxValues := splitByteArrays(i.maxValues) 516 if sizeLimit := i.sizeLimit; sizeLimit > 0 { 517 for i, v := range minValues { 518 minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) 519 } 520 for i, v := range maxValues { 521 maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) 522 } 523 } 524 return i.columnIndex( 525 minValues, 526 maxValues, 527 orderOfBytes(minValues), 528 orderOfBytes(maxValues), 529 ) 530 } 531 532 type fixedLenByteArrayColumnIndexer struct { 533 baseColumnIndexer 534 size int 535 sizeLimit int 536 minValues []byte 537 maxValues []byte 538 } 539 540 func newFixedLenByteArrayColumnIndexer(size, sizeLimit int) *fixedLenByteArrayColumnIndexer { 541 return &fixedLenByteArrayColumnIndexer{ 542 size: size, 543 sizeLimit: sizeLimit, 544 } 545 } 546 547 func (i *fixedLenByteArrayColumnIndexer) Reset() { 548 i.reset() 549 i.minValues = i.minValues[:0] 550 i.maxValues = i.maxValues[:0] 551 } 552 553 func (i *fixedLenByteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 554 i.observe(numValues, numNulls) 555 i.minValues = append(i.minValues, min.byteArray()...) 556 i.maxValues = append(i.maxValues, max.byteArray()...) 557 } 558 559 func (i *fixedLenByteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { 560 minValues := splitFixedLenByteArrays(i.minValues, i.size) 561 maxValues := splitFixedLenByteArrays(i.maxValues, i.size) 562 if sizeLimit := i.sizeLimit; sizeLimit > 0 { 563 for i, v := range minValues { 564 minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) 565 } 566 for i, v := range maxValues { 567 maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) 568 } 569 } 570 return i.columnIndex( 571 minValues, 572 maxValues, 573 orderOfBytes(minValues), 574 orderOfBytes(maxValues), 575 ) 576 } 577 578 type uint32ColumnIndexer struct { 579 baseColumnIndexer 580 minValues []uint32 581 maxValues []uint32 582 } 583 584 func newUint32ColumnIndexer() *uint32ColumnIndexer { 585 return new(uint32ColumnIndexer) 586 } 587 588 func (i *uint32ColumnIndexer) Reset() { 589 i.reset() 590 i.minValues = i.minValues[:0] 591 i.maxValues = i.maxValues[:0] 592 } 593 594 func (i *uint32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 595 i.observe(numValues, numNulls) 596 i.minValues = append(i.minValues, min.uint32()) 597 i.maxValues = append(i.maxValues, max.uint32()) 598 } 599 600 func (i *uint32ColumnIndexer) ColumnIndex() format.ColumnIndex { 601 return i.columnIndex( 602 splitFixedLenByteArrays(unsafecast.Uint32ToBytes(i.minValues), 4), 603 splitFixedLenByteArrays(unsafecast.Uint32ToBytes(i.maxValues), 4), 604 orderOfUint32(i.minValues), 605 orderOfUint32(i.maxValues), 606 ) 607 } 608 609 type uint64ColumnIndexer struct { 610 baseColumnIndexer 611 minValues []uint64 612 maxValues []uint64 613 } 614 615 func newUint64ColumnIndexer() *uint64ColumnIndexer { 616 return new(uint64ColumnIndexer) 617 } 618 619 func (i *uint64ColumnIndexer) Reset() { 620 i.reset() 621 i.minValues = i.minValues[:0] 622 i.maxValues = i.maxValues[:0] 623 } 624 625 func (i *uint64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 626 i.observe(numValues, numNulls) 627 i.minValues = append(i.minValues, min.uint64()) 628 i.maxValues = append(i.maxValues, max.uint64()) 629 } 630 631 func (i *uint64ColumnIndexer) ColumnIndex() format.ColumnIndex { 632 return i.columnIndex( 633 splitFixedLenByteArrays(unsafecast.Uint64ToBytes(i.minValues), 8), 634 splitFixedLenByteArrays(unsafecast.Uint64ToBytes(i.maxValues), 8), 635 orderOfUint64(i.minValues), 636 orderOfUint64(i.maxValues), 637 ) 638 } 639 640 type be128ColumnIndexer struct { 641 baseColumnIndexer 642 minValues [][16]byte 643 maxValues [][16]byte 644 } 645 646 func newBE128ColumnIndexer() *be128ColumnIndexer { 647 return new(be128ColumnIndexer) 648 } 649 650 func (i *be128ColumnIndexer) Reset() { 651 i.reset() 652 i.minValues = i.minValues[:0] 653 i.maxValues = i.maxValues[:0] 654 } 655 656 func (i *be128ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 657 i.observe(numValues, numNulls) 658 if !min.IsNull() { 659 i.minValues = append(i.minValues, *(*[16]byte)(min.byteArray())) 660 } 661 if !max.IsNull() { 662 i.maxValues = append(i.maxValues, *(*[16]byte)(max.byteArray())) 663 } 664 } 665 666 func (i *be128ColumnIndexer) ColumnIndex() format.ColumnIndex { 667 minValues := splitFixedLenByteArrays(unsafecast.Uint128ToBytes(i.minValues), 16) 668 maxValues := splitFixedLenByteArrays(unsafecast.Uint128ToBytes(i.maxValues), 16) 669 return i.columnIndex( 670 minValues, 671 maxValues, 672 orderOfBytes(minValues), 673 orderOfBytes(maxValues), 674 ) 675 } 676 677 func truncateLargeMinByteArrayValue(value []byte, sizeLimit int) []byte { 678 if len(value) > sizeLimit { 679 value = value[:sizeLimit] 680 } 681 return value 682 } 683 684 // truncateLargeMaxByteArrayValue truncates the given byte array to the given size limit. 685 // If the given byte array is truncated, it is incremented by 1 in place. 686 func truncateLargeMaxByteArrayValue(value []byte, sizeLimit int) []byte { 687 if len(value) > sizeLimit { 688 value = value[:sizeLimit] 689 incrementByteArrayInplace(value) 690 } 691 return value 692 } 693 694 // incrementByteArray increments the given byte array by 1. 695 // Reference: https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java#L124 696 func incrementByteArrayInplace(value []byte) { 697 for i := len(value) - 1; i >= 0; i-- { 698 value[i]++ 699 if value[i] != 0 { // Did not overflow: 0xFF -> 0x00 700 return 701 } 702 } 703 // Fully overflowed, so restore all to 0xFF 704 for i := range value { 705 value[i] = 0xFF 706 } 707 } 708 709 func splitByteArrays(data []byte) [][]byte { 710 length := 0 711 plain.RangeByteArray(data, func([]byte) error { 712 length++ 713 return nil 714 }) 715 buffer := make([]byte, 0, len(data)-(4*length)) 716 values := make([][]byte, 0, length) 717 plain.RangeByteArray(data, func(value []byte) error { 718 offset := len(buffer) 719 buffer = append(buffer, value...) 720 values = append(values, buffer[offset:]) 721 return nil 722 }) 723 return values 724 } 725 726 func splitFixedLenByteArrays(data []byte, size int) [][]byte { 727 data = copyBytes(data) 728 values := make([][]byte, len(data)/size) 729 for i := range values { 730 j := (i + 0) * size 731 k := (i + 1) * size 732 values[i] = data[j:k:k] 733 } 734 return values 735 } 736 737 func boundaryOrderOf(minOrder, maxOrder int) format.BoundaryOrder { 738 if minOrder == maxOrder { 739 switch { 740 case minOrder > 0: 741 return format.Ascending 742 case minOrder < 0: 743 return format.Descending 744 } 745 } 746 return format.Unordered 747 }