github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/column_index.go (about) 1 package parquet 2 3 import ( 4 "github.com/segmentio/parquet-go/deprecated" 5 "github.com/segmentio/parquet-go/encoding/plain" 6 "github.com/segmentio/parquet-go/format" 7 "github.com/segmentio/parquet-go/internal/unsafecast" 8 ) 9 10 type ColumnIndex interface { 11 // NumPages returns the number of paged in the column index. 12 NumPages() int 13 14 // Returns the number of null values in the page at the given index. 15 NullCount(int) int64 16 17 // Tells whether the page at the given index contains null values only. 18 NullPage(int) bool 19 20 // PageIndex return min/max bounds for the page at the given index in the 21 // column. 22 MinValue(int) Value 23 MaxValue(int) Value 24 25 // IsAscending returns true if the column index min/max values are sorted 26 // in ascending order (based on the ordering rules of the column's logical 27 // type). 28 IsAscending() bool 29 30 // IsDescending returns true if the column index min/max values are sorted 31 // in descending order (based on the ordering rules of the column's logical 32 // type). 33 IsDescending() bool 34 } 35 36 // NewColumnIndex constructs a ColumnIndex instance from the given parquet 37 // format column index. The kind argument configures the type of values 38 func NewColumnIndex(kind Kind, index *format.ColumnIndex) ColumnIndex { 39 return &formatColumnIndex{ 40 kind: kind, 41 index: index, 42 } 43 } 44 45 type formatColumnIndex struct { 46 kind Kind 47 index *format.ColumnIndex 48 } 49 50 func (f *formatColumnIndex) NumPages() int { 51 return len(f.index.MinValues) 52 } 53 54 func (f *formatColumnIndex) NullCount(i int) int64 { 55 if len(f.index.NullCounts) > 0 { 56 return f.index.NullCounts[i] 57 } 58 return 0 59 } 60 61 func (f *formatColumnIndex) NullPage(i int) bool { 62 return len(f.index.NullPages) > 0 && f.index.NullPages[i] 63 } 64 65 func (f *formatColumnIndex) MinValue(i int) Value { 66 if f.NullPage(i) { 67 return Value{} 68 } 69 return f.kind.Value(f.index.MinValues[i]) 70 } 71 72 func (f *formatColumnIndex) MaxValue(i int) Value { 73 if f.NullPage(i) { 74 return Value{} 75 } 76 return f.kind.Value(f.index.MaxValues[i]) 77 } 78 79 func (f *formatColumnIndex) IsAscending() bool { 80 return f.index.BoundaryOrder == format.Ascending 81 } 82 83 func (f *formatColumnIndex) IsDescending() bool { 84 return f.index.BoundaryOrder == format.Descending 85 } 86 87 type fileColumnIndex struct{ chunk *fileColumnChunk } 88 89 func (i fileColumnIndex) NumPages() int { 90 return len(i.chunk.columnIndex.NullPages) 91 } 92 93 func (i fileColumnIndex) NullCount(j int) int64 { 94 if len(i.chunk.columnIndex.NullCounts) > 0 { 95 return i.chunk.columnIndex.NullCounts[j] 96 } 97 return 0 98 } 99 100 func (i fileColumnIndex) NullPage(j int) bool { 101 return len(i.chunk.columnIndex.NullPages) > 0 && i.chunk.columnIndex.NullPages[j] 102 } 103 104 func (i fileColumnIndex) MinValue(j int) Value { 105 if i.NullPage(j) { 106 return Value{} 107 } 108 return i.makeValue(i.chunk.columnIndex.MinValues[j]) 109 } 110 111 func (i fileColumnIndex) MaxValue(j int) Value { 112 if i.NullPage(j) { 113 return Value{} 114 } 115 return i.makeValue(i.chunk.columnIndex.MaxValues[j]) 116 } 117 118 func (i fileColumnIndex) IsAscending() bool { 119 return i.chunk.columnIndex.BoundaryOrder == format.Ascending 120 } 121 122 func (i fileColumnIndex) IsDescending() bool { 123 return i.chunk.columnIndex.BoundaryOrder == format.Descending 124 } 125 126 func (i *fileColumnIndex) makeValue(b []byte) Value { 127 return i.chunk.column.typ.Kind().Value(b) 128 } 129 130 type emptyColumnIndex struct{} 131 132 func (emptyColumnIndex) NumPages() int { return 0 } 133 func (emptyColumnIndex) NullCount(int) int64 { return 0 } 134 func (emptyColumnIndex) NullPage(int) bool { return false } 135 func (emptyColumnIndex) MinValue(int) Value { return Value{} } 136 func (emptyColumnIndex) MaxValue(int) Value { return Value{} } 137 func (emptyColumnIndex) IsAscending() bool { return false } 138 func (emptyColumnIndex) IsDescending() bool { return false } 139 140 type booleanColumnIndex struct{ page *booleanPage } 141 142 func (i booleanColumnIndex) NumPages() int { return 1 } 143 func (i booleanColumnIndex) NullCount(int) int64 { return 0 } 144 func (i booleanColumnIndex) NullPage(int) bool { return false } 145 func (i booleanColumnIndex) MinValue(int) Value { return makeValueBoolean(i.page.min()) } 146 func (i booleanColumnIndex) MaxValue(int) Value { return makeValueBoolean(i.page.max()) } 147 func (i booleanColumnIndex) IsAscending() bool { return false } 148 func (i booleanColumnIndex) IsDescending() bool { return false } 149 150 type int32ColumnIndex struct{ page *int32Page } 151 152 func (i int32ColumnIndex) NumPages() int { return 1 } 153 func (i int32ColumnIndex) NullCount(int) int64 { return 0 } 154 func (i int32ColumnIndex) NullPage(int) bool { return false } 155 func (i int32ColumnIndex) MinValue(int) Value { return makeValueInt32(i.page.min()) } 156 func (i int32ColumnIndex) MaxValue(int) Value { return makeValueInt32(i.page.max()) } 157 func (i int32ColumnIndex) IsAscending() bool { return false } 158 func (i int32ColumnIndex) IsDescending() bool { return false } 159 160 type int64ColumnIndex struct{ page *int64Page } 161 162 func (i int64ColumnIndex) NumPages() int { return 1 } 163 func (i int64ColumnIndex) NullCount(int) int64 { return 0 } 164 func (i int64ColumnIndex) NullPage(int) bool { return false } 165 func (i int64ColumnIndex) MinValue(int) Value { return makeValueInt64(i.page.min()) } 166 func (i int64ColumnIndex) MaxValue(int) Value { return makeValueInt64(i.page.max()) } 167 func (i int64ColumnIndex) IsAscending() bool { return false } 168 func (i int64ColumnIndex) IsDescending() bool { return false } 169 170 type int96ColumnIndex struct{ page *int96Page } 171 172 func (i int96ColumnIndex) NumPages() int { return 1 } 173 func (i int96ColumnIndex) NullCount(int) int64 { return 0 } 174 func (i int96ColumnIndex) NullPage(int) bool { return false } 175 func (i int96ColumnIndex) MinValue(int) Value { return makeValueInt96(i.page.min()) } 176 func (i int96ColumnIndex) MaxValue(int) Value { return makeValueInt96(i.page.max()) } 177 func (i int96ColumnIndex) IsAscending() bool { return false } 178 func (i int96ColumnIndex) IsDescending() bool { return false } 179 180 type floatColumnIndex struct{ page *floatPage } 181 182 func (i floatColumnIndex) NumPages() int { return 1 } 183 func (i floatColumnIndex) NullCount(int) int64 { return 0 } 184 func (i floatColumnIndex) NullPage(int) bool { return false } 185 func (i floatColumnIndex) MinValue(int) Value { return makeValueFloat(i.page.min()) } 186 func (i floatColumnIndex) MaxValue(int) Value { return makeValueFloat(i.page.max()) } 187 func (i floatColumnIndex) IsAscending() bool { return false } 188 func (i floatColumnIndex) IsDescending() bool { return false } 189 190 type doubleColumnIndex struct{ page *doublePage } 191 192 func (i doubleColumnIndex) NumPages() int { return 1 } 193 func (i doubleColumnIndex) NullCount(int) int64 { return 0 } 194 func (i doubleColumnIndex) NullPage(int) bool { return false } 195 func (i doubleColumnIndex) MinValue(int) Value { return makeValueDouble(i.page.min()) } 196 func (i doubleColumnIndex) MaxValue(int) Value { return makeValueDouble(i.page.max()) } 197 func (i doubleColumnIndex) IsAscending() bool { return false } 198 func (i doubleColumnIndex) IsDescending() bool { return false } 199 200 type byteArrayColumnIndex struct{ page *byteArrayPage } 201 202 func (i byteArrayColumnIndex) NumPages() int { return 1 } 203 func (i byteArrayColumnIndex) NullCount(int) int64 { return 0 } 204 func (i byteArrayColumnIndex) NullPage(int) bool { return false } 205 func (i byteArrayColumnIndex) MinValue(int) Value { return makeValueBytes(ByteArray, i.page.min()) } 206 func (i byteArrayColumnIndex) MaxValue(int) Value { return makeValueBytes(ByteArray, i.page.max()) } 207 func (i byteArrayColumnIndex) IsAscending() bool { return false } 208 func (i byteArrayColumnIndex) IsDescending() bool { return false } 209 210 type fixedLenByteArrayColumnIndex struct{ page *fixedLenByteArrayPage } 211 212 func (i fixedLenByteArrayColumnIndex) NumPages() int { return 1 } 213 func (i fixedLenByteArrayColumnIndex) NullCount(int) int64 { return 0 } 214 func (i fixedLenByteArrayColumnIndex) NullPage(int) bool { return false } 215 func (i fixedLenByteArrayColumnIndex) MinValue(int) Value { 216 return makeValueBytes(FixedLenByteArray, i.page.min()) 217 } 218 func (i fixedLenByteArrayColumnIndex) MaxValue(int) Value { 219 return makeValueBytes(FixedLenByteArray, i.page.max()) 220 } 221 func (i fixedLenByteArrayColumnIndex) IsAscending() bool { return false } 222 func (i fixedLenByteArrayColumnIndex) IsDescending() bool { return false } 223 224 type uint32ColumnIndex struct{ page *uint32Page } 225 226 func (i uint32ColumnIndex) NumPages() int { return 1 } 227 func (i uint32ColumnIndex) NullCount(int) int64 { return 0 } 228 func (i uint32ColumnIndex) NullPage(int) bool { return false } 229 func (i uint32ColumnIndex) MinValue(int) Value { return makeValueUint32(i.page.min()) } 230 func (i uint32ColumnIndex) MaxValue(int) Value { return makeValueUint32(i.page.max()) } 231 func (i uint32ColumnIndex) IsAscending() bool { return false } 232 func (i uint32ColumnIndex) IsDescending() bool { return false } 233 234 type uint64ColumnIndex struct{ page *uint64Page } 235 236 func (i uint64ColumnIndex) NumPages() int { return 1 } 237 func (i uint64ColumnIndex) NullCount(int) int64 { return 0 } 238 func (i uint64ColumnIndex) NullPage(int) bool { return false } 239 func (i uint64ColumnIndex) MinValue(int) Value { return makeValueUint64(i.page.min()) } 240 func (i uint64ColumnIndex) MaxValue(int) Value { return makeValueUint64(i.page.max()) } 241 func (i uint64ColumnIndex) IsAscending() bool { return false } 242 func (i uint64ColumnIndex) IsDescending() bool { return false } 243 244 type be128ColumnIndex struct{ page *be128Page } 245 246 func (i be128ColumnIndex) NumPages() int { return 1 } 247 func (i be128ColumnIndex) NullCount(int) int64 { return 0 } 248 func (i be128ColumnIndex) NullPage(int) bool { return false } 249 func (i be128ColumnIndex) MinValue(int) Value { return makeValueBytes(FixedLenByteArray, i.page.min()) } 250 func (i be128ColumnIndex) MaxValue(int) Value { return makeValueBytes(FixedLenByteArray, i.page.max()) } 251 func (i be128ColumnIndex) IsAscending() bool { return false } 252 func (i be128ColumnIndex) IsDescending() bool { return false } 253 254 // The ColumnIndexer interface is implemented by types that support generating 255 // parquet column indexes. 256 // 257 // The package does not export any types that implement this interface, programs 258 // must call NewColumnIndexer on a Type instance to construct column indexers. 259 type ColumnIndexer interface { 260 // Resets the column indexer state. 261 Reset() 262 263 // Add a page to the column indexer. 264 IndexPage(numValues, numNulls int64, min, max Value) 265 266 // Generates a format.ColumnIndex value from the current state of the 267 // column indexer. 268 // 269 // The returned value may reference internal buffers, in which case the 270 // values remain valid until the next call to IndexPage or Reset on the 271 // column indexer. 272 ColumnIndex() format.ColumnIndex 273 } 274 275 type baseColumnIndexer struct { 276 nullPages []bool 277 nullCounts []int64 278 } 279 280 func (i *baseColumnIndexer) reset() { 281 i.nullPages = i.nullPages[:0] 282 i.nullCounts = i.nullCounts[:0] 283 } 284 285 func (i *baseColumnIndexer) observe(numValues, numNulls int64) { 286 i.nullPages = append(i.nullPages, numValues == numNulls) 287 i.nullCounts = append(i.nullCounts, numNulls) 288 } 289 290 func (i *baseColumnIndexer) columnIndex(minValues, maxValues [][]byte, minOrder, maxOrder int) format.ColumnIndex { 291 return format.ColumnIndex{ 292 NullPages: i.nullPages, 293 NullCounts: i.nullCounts, 294 MinValues: minValues, 295 MaxValues: maxValues, 296 BoundaryOrder: boundaryOrderOf(minOrder, maxOrder), 297 } 298 } 299 300 type booleanColumnIndexer struct { 301 baseColumnIndexer 302 minValues []bool 303 maxValues []bool 304 } 305 306 func newBooleanColumnIndexer() *booleanColumnIndexer { 307 return new(booleanColumnIndexer) 308 } 309 310 func (i *booleanColumnIndexer) Reset() { 311 i.reset() 312 i.minValues = i.minValues[:0] 313 i.maxValues = i.maxValues[:0] 314 } 315 316 func (i *booleanColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 317 i.observe(numValues, numNulls) 318 i.minValues = append(i.minValues, min.boolean()) 319 i.maxValues = append(i.maxValues, max.boolean()) 320 } 321 322 func (i *booleanColumnIndexer) ColumnIndex() format.ColumnIndex { 323 return i.columnIndex( 324 splitFixedLenByteArrays(unsafecast.BoolToBytes(i.minValues), 1), 325 splitFixedLenByteArrays(unsafecast.BoolToBytes(i.maxValues), 1), 326 orderOfBool(i.minValues), 327 orderOfBool(i.maxValues), 328 ) 329 } 330 331 type int32ColumnIndexer struct { 332 baseColumnIndexer 333 minValues []int32 334 maxValues []int32 335 } 336 337 func newInt32ColumnIndexer() *int32ColumnIndexer { 338 return new(int32ColumnIndexer) 339 } 340 341 func (i *int32ColumnIndexer) Reset() { 342 i.reset() 343 i.minValues = i.minValues[:0] 344 i.maxValues = i.maxValues[:0] 345 } 346 347 func (i *int32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 348 i.observe(numValues, numNulls) 349 i.minValues = append(i.minValues, min.int32()) 350 i.maxValues = append(i.maxValues, max.int32()) 351 } 352 353 func (i *int32ColumnIndexer) ColumnIndex() format.ColumnIndex { 354 return i.columnIndex( 355 splitFixedLenByteArrays(unsafecast.Int32ToBytes(i.minValues), 4), 356 splitFixedLenByteArrays(unsafecast.Int32ToBytes(i.maxValues), 4), 357 orderOfInt32(i.minValues), 358 orderOfInt32(i.maxValues), 359 ) 360 } 361 362 type int64ColumnIndexer struct { 363 baseColumnIndexer 364 minValues []int64 365 maxValues []int64 366 } 367 368 func newInt64ColumnIndexer() *int64ColumnIndexer { 369 return new(int64ColumnIndexer) 370 } 371 372 func (i *int64ColumnIndexer) Reset() { 373 i.reset() 374 i.minValues = i.minValues[:0] 375 i.maxValues = i.maxValues[:0] 376 } 377 378 func (i *int64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 379 i.observe(numValues, numNulls) 380 i.minValues = append(i.minValues, min.int64()) 381 i.maxValues = append(i.maxValues, max.int64()) 382 } 383 384 func (i *int64ColumnIndexer) ColumnIndex() format.ColumnIndex { 385 return i.columnIndex( 386 splitFixedLenByteArrays(unsafecast.Int64ToBytes(i.minValues), 8), 387 splitFixedLenByteArrays(unsafecast.Int64ToBytes(i.maxValues), 8), 388 orderOfInt64(i.minValues), 389 orderOfInt64(i.maxValues), 390 ) 391 } 392 393 type int96ColumnIndexer struct { 394 baseColumnIndexer 395 minValues []deprecated.Int96 396 maxValues []deprecated.Int96 397 } 398 399 func newInt96ColumnIndexer() *int96ColumnIndexer { 400 return new(int96ColumnIndexer) 401 } 402 403 func (i *int96ColumnIndexer) Reset() { 404 i.reset() 405 i.minValues = i.minValues[:0] 406 i.maxValues = i.maxValues[:0] 407 } 408 409 func (i *int96ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 410 i.observe(numValues, numNulls) 411 i.minValues = append(i.minValues, min.Int96()) 412 i.maxValues = append(i.maxValues, max.Int96()) 413 } 414 415 func (i *int96ColumnIndexer) ColumnIndex() format.ColumnIndex { 416 return i.columnIndex( 417 splitFixedLenByteArrays(deprecated.Int96ToBytes(i.minValues), 12), 418 splitFixedLenByteArrays(deprecated.Int96ToBytes(i.maxValues), 12), 419 deprecated.OrderOfInt96(i.minValues), 420 deprecated.OrderOfInt96(i.maxValues), 421 ) 422 } 423 424 type floatColumnIndexer struct { 425 baseColumnIndexer 426 minValues []float32 427 maxValues []float32 428 } 429 430 func newFloatColumnIndexer() *floatColumnIndexer { 431 return new(floatColumnIndexer) 432 } 433 434 func (i *floatColumnIndexer) Reset() { 435 i.reset() 436 i.minValues = i.minValues[:0] 437 i.maxValues = i.maxValues[:0] 438 } 439 440 func (i *floatColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 441 i.observe(numValues, numNulls) 442 i.minValues = append(i.minValues, min.float()) 443 i.maxValues = append(i.maxValues, max.float()) 444 } 445 446 func (i *floatColumnIndexer) ColumnIndex() format.ColumnIndex { 447 return i.columnIndex( 448 splitFixedLenByteArrays(unsafecast.Float32ToBytes(i.minValues), 4), 449 splitFixedLenByteArrays(unsafecast.Float32ToBytes(i.maxValues), 4), 450 orderOfFloat32(i.minValues), 451 orderOfFloat32(i.maxValues), 452 ) 453 } 454 455 type doubleColumnIndexer struct { 456 baseColumnIndexer 457 minValues []float64 458 maxValues []float64 459 } 460 461 func newDoubleColumnIndexer() *doubleColumnIndexer { 462 return new(doubleColumnIndexer) 463 } 464 465 func (i *doubleColumnIndexer) Reset() { 466 i.reset() 467 i.minValues = i.minValues[:0] 468 i.maxValues = i.maxValues[:0] 469 } 470 471 func (i *doubleColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 472 i.observe(numValues, numNulls) 473 i.minValues = append(i.minValues, min.double()) 474 i.maxValues = append(i.maxValues, max.double()) 475 } 476 477 func (i *doubleColumnIndexer) ColumnIndex() format.ColumnIndex { 478 return i.columnIndex( 479 splitFixedLenByteArrays(unsafecast.Float64ToBytes(i.minValues), 8), 480 splitFixedLenByteArrays(unsafecast.Float64ToBytes(i.maxValues), 8), 481 orderOfFloat64(i.minValues), 482 orderOfFloat64(i.maxValues), 483 ) 484 } 485 486 type byteArrayColumnIndexer struct { 487 baseColumnIndexer 488 sizeLimit int 489 minValues []byte 490 maxValues []byte 491 } 492 493 func newByteArrayColumnIndexer(sizeLimit int) *byteArrayColumnIndexer { 494 return &byteArrayColumnIndexer{sizeLimit: sizeLimit} 495 } 496 497 func (i *byteArrayColumnIndexer) Reset() { 498 i.reset() 499 i.minValues = i.minValues[:0] 500 i.maxValues = i.maxValues[:0] 501 } 502 503 func (i *byteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 504 i.observe(numValues, numNulls) 505 i.minValues = plain.AppendByteArray(i.minValues, min.byteArray()) 506 i.maxValues = plain.AppendByteArray(i.maxValues, max.byteArray()) 507 } 508 509 func (i *byteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { 510 minValues := splitByteArrays(i.minValues) 511 maxValues := splitByteArrays(i.maxValues) 512 if sizeLimit := i.sizeLimit; sizeLimit > 0 { 513 for i, v := range minValues { 514 minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) 515 } 516 for i, v := range maxValues { 517 maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) 518 } 519 } 520 return i.columnIndex( 521 minValues, 522 maxValues, 523 orderOfBytes(minValues), 524 orderOfBytes(maxValues), 525 ) 526 } 527 528 type fixedLenByteArrayColumnIndexer struct { 529 baseColumnIndexer 530 size int 531 sizeLimit int 532 minValues []byte 533 maxValues []byte 534 } 535 536 func newFixedLenByteArrayColumnIndexer(size, sizeLimit int) *fixedLenByteArrayColumnIndexer { 537 return &fixedLenByteArrayColumnIndexer{ 538 size: size, 539 sizeLimit: sizeLimit, 540 } 541 } 542 543 func (i *fixedLenByteArrayColumnIndexer) Reset() { 544 i.reset() 545 i.minValues = i.minValues[:0] 546 i.maxValues = i.maxValues[:0] 547 } 548 549 func (i *fixedLenByteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 550 i.observe(numValues, numNulls) 551 i.minValues = append(i.minValues, min.byteArray()...) 552 i.maxValues = append(i.maxValues, max.byteArray()...) 553 } 554 555 func (i *fixedLenByteArrayColumnIndexer) ColumnIndex() format.ColumnIndex { 556 minValues := splitFixedLenByteArrays(i.minValues, i.size) 557 maxValues := splitFixedLenByteArrays(i.maxValues, i.size) 558 if sizeLimit := i.sizeLimit; sizeLimit > 0 { 559 for i, v := range minValues { 560 minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit) 561 } 562 for i, v := range maxValues { 563 maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit) 564 } 565 } 566 return i.columnIndex( 567 minValues, 568 maxValues, 569 orderOfBytes(minValues), 570 orderOfBytes(maxValues), 571 ) 572 } 573 574 type uint32ColumnIndexer struct { 575 baseColumnIndexer 576 minValues []uint32 577 maxValues []uint32 578 } 579 580 func newUint32ColumnIndexer() *uint32ColumnIndexer { 581 return new(uint32ColumnIndexer) 582 } 583 584 func (i *uint32ColumnIndexer) Reset() { 585 i.reset() 586 i.minValues = i.minValues[:0] 587 i.maxValues = i.maxValues[:0] 588 } 589 590 func (i *uint32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 591 i.observe(numValues, numNulls) 592 i.minValues = append(i.minValues, min.uint32()) 593 i.maxValues = append(i.maxValues, max.uint32()) 594 } 595 596 func (i *uint32ColumnIndexer) ColumnIndex() format.ColumnIndex { 597 return i.columnIndex( 598 splitFixedLenByteArrays(unsafecast.Uint32ToBytes(i.minValues), 4), 599 splitFixedLenByteArrays(unsafecast.Uint32ToBytes(i.maxValues), 4), 600 orderOfUint32(i.minValues), 601 orderOfUint32(i.maxValues), 602 ) 603 } 604 605 type uint64ColumnIndexer struct { 606 baseColumnIndexer 607 minValues []uint64 608 maxValues []uint64 609 } 610 611 func newUint64ColumnIndexer() *uint64ColumnIndexer { 612 return new(uint64ColumnIndexer) 613 } 614 615 func (i *uint64ColumnIndexer) Reset() { 616 i.reset() 617 i.minValues = i.minValues[:0] 618 i.maxValues = i.maxValues[:0] 619 } 620 621 func (i *uint64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 622 i.observe(numValues, numNulls) 623 i.minValues = append(i.minValues, min.uint64()) 624 i.maxValues = append(i.maxValues, max.uint64()) 625 } 626 627 func (i *uint64ColumnIndexer) ColumnIndex() format.ColumnIndex { 628 return i.columnIndex( 629 splitFixedLenByteArrays(unsafecast.Uint64ToBytes(i.minValues), 8), 630 splitFixedLenByteArrays(unsafecast.Uint64ToBytes(i.maxValues), 8), 631 orderOfUint64(i.minValues), 632 orderOfUint64(i.maxValues), 633 ) 634 } 635 636 type be128ColumnIndexer struct { 637 baseColumnIndexer 638 minValues [][16]byte 639 maxValues [][16]byte 640 } 641 642 func newBE128ColumnIndexer() *be128ColumnIndexer { 643 return new(be128ColumnIndexer) 644 } 645 646 func (i *be128ColumnIndexer) Reset() { 647 i.reset() 648 i.minValues = i.minValues[:0] 649 i.maxValues = i.maxValues[:0] 650 } 651 652 func (i *be128ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) { 653 i.observe(numValues, numNulls) 654 if !min.IsNull() { 655 i.minValues = append(i.minValues, *(*[16]byte)(min.byteArray())) 656 } 657 if !max.IsNull() { 658 i.maxValues = append(i.maxValues, *(*[16]byte)(max.byteArray())) 659 } 660 } 661 662 func (i *be128ColumnIndexer) ColumnIndex() format.ColumnIndex { 663 minValues := splitFixedLenByteArrays(unsafecast.Uint128ToBytes(i.minValues), 16) 664 maxValues := splitFixedLenByteArrays(unsafecast.Uint128ToBytes(i.maxValues), 16) 665 return i.columnIndex( 666 minValues, 667 maxValues, 668 orderOfBytes(minValues), 669 orderOfBytes(maxValues), 670 ) 671 } 672 673 func truncateLargeMinByteArrayValue(value []byte, sizeLimit int) []byte { 674 if len(value) > sizeLimit { 675 value = value[:sizeLimit] 676 } 677 return value 678 } 679 680 // truncateLargeMaxByteArrayValue truncates the given byte array to the given size limit. 681 // If the given byte array is truncated, it is incremented by 1 in place. 682 func truncateLargeMaxByteArrayValue(value []byte, sizeLimit int) []byte { 683 if len(value) > sizeLimit { 684 value = value[:sizeLimit] 685 incrementByteArrayInplace(value) 686 } 687 return value 688 } 689 690 // incrementByteArray increments the given byte array by 1. 691 // Reference: https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java#L124 692 func incrementByteArrayInplace(value []byte) { 693 for i := len(value) - 1; i >= 0; i-- { 694 value[i]++ 695 if value[i] != 0 { // Did not overflow: 0xFF -> 0x00 696 return 697 } 698 } 699 // Fully overflowed, so restore all to 0xFF 700 for i := range value { 701 value[i] = 0xFF 702 } 703 } 704 705 func splitByteArrays(data []byte) [][]byte { 706 length := 0 707 plain.RangeByteArray(data, func([]byte) error { 708 length++ 709 return nil 710 }) 711 buffer := make([]byte, 0, len(data)-(4*length)) 712 values := make([][]byte, 0, length) 713 plain.RangeByteArray(data, func(value []byte) error { 714 offset := len(buffer) 715 buffer = append(buffer, value...) 716 values = append(values, buffer[offset:]) 717 return nil 718 }) 719 return values 720 } 721 722 func splitFixedLenByteArrays(data []byte, size int) [][]byte { 723 data = copyBytes(data) 724 values := make([][]byte, len(data)/size) 725 for i := range values { 726 j := (i + 0) * size 727 k := (i + 1) * size 728 values[i] = data[j:k:k] 729 } 730 return values 731 } 732 733 func boundaryOrderOf(minOrder, maxOrder int) format.BoundaryOrder { 734 if minOrder == maxOrder { 735 switch { 736 case minOrder > 0: 737 return format.Ascending 738 case minOrder < 0: 739 return format.Descending 740 } 741 } 742 return format.Unordered 743 }