// github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/column_test.go

package parquet_test

import (
	"bytes"
	"fmt"
	"math/rand"
	"testing"

	"github.com/google/uuid"

	"github.com/parquet-go/parquet-go"
	"github.com/parquet-go/parquet-go/deprecated"
	"github.com/parquet-go/parquet-go/format"
)

func TestColumnPageIndex(t *testing.T) {
	for _, config := range [...]struct {
		name string
		test func(*testing.T, rows) bool
	}{
		{
			name: "buffer",
			test: testColumnPageIndexWithBuffer,
		},
		{
			name: "file",
			test: testColumnPageIndexWithFile,
		},
	} {
		t.Run(config.name, func(t *testing.T) {
			for _, test := range [...]struct {
				scenario string
				function func(*testing.T) interface{}
			}{
				{
					scenario: "boolean",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value bool }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "int32",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value int32 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "int64",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value int64 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "int96",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value deprecated.Int96 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "uint32",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value uint32 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "uint64",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value uint64 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "float32",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value float32 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "float64",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value float64 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "string",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value string }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "uuid",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value uuid.UUID }) bool { return config.test(t, makeRows(rows)) }
					},
				},
			} {
				t.Run(test.scenario, func(t *testing.T) {
					if err := quickCheck(test.function(t)); err != nil {
						t.Error(err)
					}
				})
			}
		})
	}
}

func testColumnPageIndexWithBuffer(t *testing.T, rows rows) bool {
	if len(rows) > 0 {
		b := parquet.NewBuffer()
		for _, row := range rows {
			b.Write(row)
		}
		if err := checkRowGroupColumnIndex(b); err != nil {
			t.Error(err)
			return false
		}
		if err := checkRowGroupOffsetIndex(b); err != nil {
			t.Error(err)
			return false
		}
	}
	return true
}

func checkRowGroupColumnIndex(rowGroup parquet.RowGroup) error {
	for i, column := range rowGroup.ColumnChunks() {
		if err := checkColumnChunkColumnIndex(column); err != nil {
			return fmt.Errorf("column chunk @i=%d: %w", i, err)
		}
	}
	return nil
}

func checkColumnChunkColumnIndex(columnChunk parquet.ColumnChunk) error {
	columnType := columnChunk.Type()
	columnIndex, _ := columnChunk.ColumnIndex()
	numPages := columnIndex.NumPages()
	pagesRead := 0
	stats := newColumnStats(columnType)
	pages := columnChunk.Pages()
	defer pages.Close()

	err := forEachPage(pages, func(page parquet.Page) error {
		pageMin, pageMax, hasBounds := page.Bounds()
		if !hasBounds {
			return fmt.Errorf("page bounds are missing")
		}
		indexMin := columnIndex.MinValue(pagesRead)
		indexMax := columnIndex.MaxValue(pagesRead)

		if !parquet.Equal(pageMin, indexMin) {
			return fmt.Errorf("min page value mismatch: index=%q page=%q", indexMin, pageMin)
		}
		if !parquet.Equal(pageMax, indexMax) {
			return fmt.Errorf("max page value mismatch: index=%q page=%q", indexMax, pageMax)
		}

		numNulls := int64(0)
		numValues := int64(0)
		err := forEachValue(page.Values(), func(value parquet.Value) error {
			stats.observe(value)
			if value.IsNull() {
				numNulls++
			}
			numValues++
			return nil
		})
		if err != nil {
			return err
		}

		nullCount := columnIndex.NullCount(pagesRead)
		if numNulls != nullCount {
			return fmt.Errorf("number of null values mismatch: index=%d page=%d", nullCount, numNulls)
		}

		nullPage := columnIndex.NullPage(pagesRead)
		if numNulls > 0 && numNulls == numValues && !nullPage {
			return fmt.Errorf("page only contained null values but the index did not categorize it as a null page: nulls=%d", numNulls)
		}

		stats.pageRead()
		pagesRead++
		return nil
	})
	if err != nil {
		return fmt.Errorf("page @i=%d: %w", pagesRead, err)
	}
	if pagesRead != numPages {
		return fmt.Errorf("number of pages found in column index differs from the number of pages read: index=%d read=%d", numPages, pagesRead)
	}

	actualOrder := columnIndexOrder(columnIndex)
	observedOrder := observedIndexOrder(columnType, stats.minValues, stats.maxValues)
	xorAscending := (columnIndex.IsAscending() || observedOrder == ascendingIndexOrder) &&
		!(columnIndex.IsAscending() && observedOrder == ascendingIndexOrder)
	xorDescending := (columnIndex.IsDescending() || observedOrder == descendingIndexOrder) &&
		!(columnIndex.IsDescending() && observedOrder == descendingIndexOrder)

	if xorAscending || xorDescending {
		return fmt.Errorf("column index is declared to be %s while the observed values are %s (min values %s, max values %s)",
			actualOrder,
			observedOrder,
			valueOrder(columnType, stats.minValues),
			valueOrder(columnType, stats.maxValues),
		)
	}

	return nil
}

func checkRowGroupOffsetIndex(rowGroup parquet.RowGroup) error {
	for i, column := range rowGroup.ColumnChunks() {
		if err := checkColumnChunkOffsetIndex(column); err != nil {
			return fmt.Errorf("column chunk @i=%d: %w", i, err)
		}
	}
	return nil
}

func checkColumnChunkOffsetIndex(columnChunk parquet.ColumnChunk) error {
	offsetIndex, _ := columnChunk.OffsetIndex()
	numPages := offsetIndex.NumPages()
	pagesRead := 0
	rowIndex := int64(0)

	pages := columnChunk.Pages()
	defer pages.Close()

	err := forEachPage(pages, func(page parquet.Page) error {
		if firstRowIndex := offsetIndex.FirstRowIndex(pagesRead); firstRowIndex != rowIndex {
			return fmt.Errorf("row number mismatch: index=%d page=%d", firstRowIndex, rowIndex)
		}
		rowIndex += page.NumRows()
		pagesRead++
		return nil
	})
	if err != nil {
		return fmt.Errorf("page @i=%d: %w", pagesRead, err)
	}

	if pagesRead != numPages {
		return fmt.Errorf("number of pages found in offset index differs from the number of pages read: index=%d read=%d", numPages, pagesRead)
	}

	return nil
}

func testColumnPageIndexWithFile(t *testing.T, rows rows) bool {
	if len(rows) > 0 {
		r := rand.New(rand.NewSource(5))
		f, err := createParquetFile(rows,
			parquet.PageBufferSize(r.Intn(49)+1),
			parquet.ColumnIndexSizeLimit(4096),
		)
		if err != nil {
			t.Error(err)
			return false
		}
		if err := checkFileColumnIndex(f); err != nil {
			t.Error(err)
			return false
		}
		if err := checkFileOffsetIndex(f); err != nil {
			t.Error(err)
			return false
		}
		for i, rowGroup := range f.RowGroups() {
			if err := checkRowGroupColumnIndex(rowGroup); err != nil {
				t.Errorf("checking column index of row group @i=%d: %v", i, err)
				return false
			}
			if err := checkRowGroupOffsetIndex(rowGroup); err != nil {
				t.Errorf("checking offset index of row group @i=%d: %v", i, err)
				return false
			}
		}
	}
	return true
}

func checkFileColumnIndex(f *parquet.File) error {
	columnIndexes := f.ColumnIndexes()
	i := 0
	return forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error {
		columnIndex, _ := chunk.ColumnIndex()
		if n := columnIndex.NumPages(); n <= 0 {
			return fmt.Errorf("invalid number of pages found in the column index: %d", n)
		}
		if i >= len(columnIndexes) {
			return fmt.Errorf("more column indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)", i, len(columnIndexes))
		}

		index1 := columnIndex
		index2 := &fileColumnIndex{
			kind:        col.Type().Kind(),
			ColumnIndex: columnIndexes[i],
		}

		numPages1 := index1.NumPages()
		numPages2 := index2.NumPages()
		if numPages1 != numPages2 {
			return fmt.Errorf("number of pages mismatch: got=%d want=%d", numPages1, numPages2)
		}

		for j := 0; j < numPages1; j++ {
			nullCount1 := index1.NullCount(j)
			nullCount2 := index2.NullCount(j)
			if nullCount1 != nullCount2 {
				return fmt.Errorf("null count of page %d/%d mismatch: got=%d want=%d", j, numPages1, nullCount1, nullCount2)
			}

			nullPage1 := index1.NullPage(j)
			nullPage2 := index2.NullPage(j)
			if nullPage1 != nullPage2 {
				return fmt.Errorf("null page of page %d/%d mismatch: got=%t want=%t", j, numPages1, nullPage1, nullPage2)
			}

			minValue1 := index1.MinValue(j)
			minValue2 := index2.MinValue(j)
			if !parquet.Equal(minValue1, minValue2) {
				return fmt.Errorf("min value of page %d/%d mismatch: got=%v want=%v", j, numPages1, minValue1, minValue2)
			}

			maxValue1 := index1.MaxValue(j)
			maxValue2 := index2.MaxValue(j)
			if !parquet.Equal(maxValue1, maxValue2) {
				return fmt.Errorf("max value of page %d/%d mismatch: got=%v want=%v", j, numPages1, maxValue1, maxValue2)
			}

			isAscending1 := index1.IsAscending()
			isAscending2 := index2.IsAscending()
			if isAscending1 != isAscending2 {
				return fmt.Errorf("ascending state of page %d/%d mismatch: got=%t want=%t", j, numPages1, isAscending1, isAscending2)
			}

			isDescending1 := index1.IsDescending()
			isDescending2 := index2.IsDescending()
			if isDescending1 != isDescending2 {
				return fmt.Errorf("descending state of page %d/%d mismatch: got=%t want=%t", j, numPages1, isDescending1, isDescending2)
			}
		}

		i++
		return nil
	})
}

func checkFileOffsetIndex(f *parquet.File) error {
	offsetIndexes := f.OffsetIndexes()
	i := 0
	return forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error {
		offsetIndex, _ := chunk.OffsetIndex()
		if n := offsetIndex.NumPages(); n <= 0 {
			return fmt.Errorf("invalid number of pages found in the offset index: %d", n)
		}
		if i >= len(offsetIndexes) {
			return fmt.Errorf("more offset indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)", i, len(offsetIndexes))
		}

		index1 := offsetIndex
		index2 := (*fileOffsetIndex)(&offsetIndexes[i])

		numPages1 := index1.NumPages()
		numPages2 := index2.NumPages()
		if numPages1 != numPages2 {
			return fmt.Errorf("number of pages mismatch: got=%d want=%d", numPages1, numPages2)
		}

		for j := 0; j < numPages1; j++ {
			offset1 := index1.Offset(j)
			offset2 := index2.Offset(j)
			if offset1 != offset2 {
				return fmt.Errorf("offsets of page %d/%d mismatch: got=%d want=%d", j, numPages1, offset1, offset2)
			}

			compressedPageSize1 := index1.CompressedPageSize(j)
			compressedPageSize2 := index2.CompressedPageSize(j)
			if compressedPageSize1 != compressedPageSize2 {
				return fmt.Errorf("compressed page size of page %d/%d mismatch: got=%d want=%d", j, numPages1, compressedPageSize1, compressedPageSize2)
			}

			firstRowIndex1 := index1.FirstRowIndex(j)
			firstRowIndex2 := index2.FirstRowIndex(j)
			if firstRowIndex1 != firstRowIndex2 {
				return fmt.Errorf("first row index of page %d/%d mismatch: got=%d want=%d", j, numPages1, firstRowIndex1, firstRowIndex2)
			}
		}

		i++
		return nil
	})
}

type fileColumnIndex struct {
	kind parquet.Kind
	format.ColumnIndex
}

func (i *fileColumnIndex) NumPages() int                { return len(i.NullPages) }
func (i *fileColumnIndex) NullCount(j int) int64        { return i.NullCounts[j] }
func (i *fileColumnIndex) NullPage(j int) bool          { return i.NullPages[j] }
func (i *fileColumnIndex) MinValue(j int) parquet.Value { return i.kind.Value(i.MinValues[j]) }
func (i *fileColumnIndex) MaxValue(j int) parquet.Value { return i.kind.Value(i.MaxValues[j]) }
func (i *fileColumnIndex) IsAscending() bool            { return i.BoundaryOrder == format.Ascending }
func (i *fileColumnIndex) IsDescending() bool           { return i.BoundaryOrder == format.Descending }

type fileOffsetIndex format.OffsetIndex

func (i *fileOffsetIndex) NumPages() int      { return len(i.PageLocations) }
func (i *fileOffsetIndex) Offset(j int) int64 { return i.PageLocations[j].Offset }
func (i *fileOffsetIndex) CompressedPageSize(j int) int64 {
	return int64(i.PageLocations[j].CompressedPageSize)
}
func (i *fileOffsetIndex) FirstRowIndex(j int) int64 { return i.PageLocations[j].FirstRowIndex }

type columnStats struct {
	page       int
	columnType parquet.Type
	minValues  []parquet.Value
	maxValues  []parquet.Value
}

func newColumnStats(columnType parquet.Type) *columnStats {
	return &columnStats{columnType: columnType}
}

func (c *columnStats) observe(value parquet.Value) {
	if c.page >= len(c.minValues) {
		c.minValues = append(c.minValues, value.Clone())
	} else if c.columnType.Compare(c.minValues[c.page], value) > 0 {
		c.minValues[c.page] = value.Clone()
	}

	if c.page >= len(c.maxValues) {
		c.maxValues = append(c.maxValues, value.Clone())
	} else if c.columnType.Compare(c.maxValues[c.page], value) < 0 {
		c.maxValues[c.page] = value.Clone()
	}
}

func (c *columnStats) pageRead() {
	c.page++
}

type indexOrder int

const (
	invalidIndexOrder indexOrder = iota
	unorderedIndexOrder
	ascendingIndexOrder
	descendingIndexOrder
)

func (o indexOrder) String() string {
	switch o {
	case unorderedIndexOrder:
		return "unordered"
	case ascendingIndexOrder:
		return "ascending"
	case descendingIndexOrder:
		return "descending"
	default:
		return "invalid"
	}
}

func columnIndexOrder(index parquet.ColumnIndex) indexOrder {
	switch {
	case index.IsAscending() && index.IsDescending():
		return invalidIndexOrder
	case index.IsAscending():
		return ascendingIndexOrder
	case index.IsDescending():
		return descendingIndexOrder
	default:
		return unorderedIndexOrder
	}
}

func observedIndexOrder(columnType parquet.Type, minValues []parquet.Value, maxValues []parquet.Value) indexOrder {
	a := valueOrder(columnType, minValues)
	b := valueOrder(columnType, maxValues)

	switch {
	case a == ascendingIndexOrder && b == ascendingIndexOrder:
		return ascendingIndexOrder
	case a == descendingIndexOrder && b == descendingIndexOrder:
		return descendingIndexOrder
	default:
		return unorderedIndexOrder
	}
}

func valueOrder(columnType parquet.Type, values []parquet.Value) indexOrder {
	switch len(values) {
	case 0, 1:
		return unorderedIndexOrder
	}

	var order int
	for i := 1; i < len(values); i++ {
		next := columnType.Compare(values[i-1], values[i])
		if next == 0 {
			continue
		}
		if order == 0 {
			order = next
			continue
		}
		if order != next {
			return unorderedIndexOrder
		}
	}

	if order > 0 {
		return descendingIndexOrder
	}

	return ascendingIndexOrder
}

func TestColumnPages_SeekToRow(t *testing.T) {
	type Contact struct {
		ID   int64  `parquet:"id"`
		Name string `parquet:"name"`
		Sex  bool   `parquet:"sex"`
	}

	buf := bytes.Buffer{}
	writer := parquet.NewWriter(&buf)
	data := [][]Contact{
		{
			{ID: 1, Name: "user1"},
			{ID: 2, Name: "user2"},
			{ID: 7, Name: "user7"},
		},
		{
			{ID: 8, Name: "user8"},
			{ID: 10, Name: "user10"},
			{ID: 12, Name: "user12"},
		},
		{
			{ID: 15, Name: "user15"},
			{ID: 16, Name: "user16"},
		},
	}
	for _, rows := range data {
		for _, row := range rows {
			err := writer.Write(&row)
			if err != nil {
				panic(err)
			}
		}
		err := writer.Flush()
		if err != nil {
			panic(err)
		}
	}
	err := writer.Close()
	if err != nil {
		panic(err)
	}

	pr, err := parquet.OpenFile(bytes.NewReader(buf.Bytes()), int64(buf.Len()))
	if err != nil {
		t.Error(err)
	}

	id := pr.Root().Column("id")

	pages := id.Pages()
	defer pages.Close()

	var idx int64
	for _, rows := range data {
		for _, row := range rows {
			err := pages.SeekToRow(idx)
			if err != nil {
				t.Error(err)
			}

			page, err := pages.ReadPage()
			if err != nil {
				t.Error(err)
			}

			var values [1]int64
			page.Values().(interface {
				ReadInt64s(values []int64) (n int, err error)
			}).ReadInt64s(values[:])

			if values[0] != row.ID {
				t.Errorf("read value of page mismatch, row index %d: got=%d want=%d", idx, values[0], row.ID)
			}

			idx++
		}
	}
}
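
// sketchColumnIndexBounds is an illustrative sketch, not part of the upstream
// test suite: it condenses the pattern exercised by
// testColumnPageIndexWithBuffer and checkColumnChunkColumnIndex, buffering a
// few rows and checking that the column index bounds of each page agree with
// the page's own bounds. The row type, sample values, and the assumption that
// reading exactly NumPages pages drains the chunk are hypothetical choices for
// this sketch; only APIs already used above are called.
func sketchColumnIndexBounds() error {
	type row struct{ Value int64 }

	// As in testColumnPageIndexWithBuffer, the buffer is created without an
	// explicit schema and rows are written directly as Go structs.
	b := parquet.NewBuffer()
	for _, v := range []int64{1, 2, 3} {
		b.Write(row{Value: v})
	}

	for _, chunk := range b.ColumnChunks() {
		columnIndex, err := chunk.ColumnIndex()
		if err != nil {
			return err
		}

		pages := chunk.Pages()
		defer pages.Close()

		// Read as many pages as the column index declares and compare the
		// per-page bounds against the index entries.
		for i, n := 0, columnIndex.NumPages(); i < n; i++ {
			page, err := pages.ReadPage()
			if err != nil {
				return err
			}
			minValue, maxValue, ok := page.Bounds()
			if !ok {
				return fmt.Errorf("page %d has no bounds", i)
			}
			if !parquet.Equal(minValue, columnIndex.MinValue(i)) ||
				!parquet.Equal(maxValue, columnIndex.MaxValue(i)) {
				return fmt.Errorf("page %d bounds do not match the column index", i)
			}
		}
	}
	return nil
}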