github.com/parquet-go/parquet-go@v0.20.0/column_test.go

package parquet_test

import (
	"fmt"
	"math/rand"
	"testing"

	"github.com/google/uuid"

	"github.com/parquet-go/parquet-go"
	"github.com/parquet-go/parquet-go/deprecated"
	"github.com/parquet-go/parquet-go/format"
)

func TestColumnPageIndex(t *testing.T) {
	for _, config := range [...]struct {
		name string
		test func(*testing.T, rows) bool
	}{
		{
			name: "buffer",
			test: testColumnPageIndexWithBuffer,
		},
		{
			name: "file",
			test: testColumnPageIndexWithFile,
		},
	} {
		t.Run(config.name, func(t *testing.T) {
			for _, test := range [...]struct {
				scenario string
				function func(*testing.T) interface{}
			}{
				{
					scenario: "boolean",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value bool }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "int32",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value int32 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "int64",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value int64 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "int96",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value deprecated.Int96 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "uint32",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value uint32 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "uint64",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value uint64 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "float32",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value float32 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "float64",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value float64 }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "string",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value string }) bool { return config.test(t, makeRows(rows)) }
					},
				},

				{
					scenario: "uuid",
					function: func(t *testing.T) interface{} {
						return func(rows []struct{ Value uuid.UUID }) bool { return config.test(t, makeRows(rows)) }
					},
				},
			} {
				t.Run(test.scenario, func(t *testing.T) {
					if err := quickCheck(test.function(t)); err != nil {
						t.Error(err)
					}
				})
			}
		})
	}
}

func testColumnPageIndexWithBuffer(t *testing.T, rows rows) bool {
	if len(rows) > 0 {
		b := parquet.NewBuffer()
		for _, row := range rows {
			b.Write(row)
		}
		if err := checkRowGroupColumnIndex(b); err != nil {
			t.Error(err)
			return false
		}
		if err := checkRowGroupOffsetIndex(b); err != nil {
			t.Error(err)
			return false
		}
	}
	return true
}

func checkRowGroupColumnIndex(rowGroup parquet.RowGroup) error {
	for i, column := range rowGroup.ColumnChunks() {
		if err := checkColumnChunkColumnIndex(column); err != nil {
			return fmt.Errorf("column chunk @i=%d: %w", i, err)
		}
	}
	return nil
}
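
// checkColumnChunkColumnIndex reads back every page of the column chunk and
// verifies that the column index matches what the pages actually contain:
// per-page min/max bounds, null counts, null-page flags, and the declared
// ascending/descending boundary order.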
func checkColumnChunkColumnIndex(columnChunk parquet.ColumnChunk) error {
	columnType := columnChunk.Type()
	columnIndex, _ := columnChunk.ColumnIndex()
	numPages := columnIndex.NumPages()
	pagesRead := 0
	stats := newColumnStats(columnType)
	pages := columnChunk.Pages()
	defer pages.Close()

	err := forEachPage(pages, func(page parquet.Page) error {
		pageMin, pageMax, hasBounds := page.Bounds()
		if !hasBounds {
			return fmt.Errorf("page bounds are missing")
		}
		indexMin := columnIndex.MinValue(pagesRead)
		indexMax := columnIndex.MaxValue(pagesRead)

		if !parquet.Equal(pageMin, indexMin) {
			return fmt.Errorf("min page value mismatch: index=%q page=%q", indexMin, pageMin)
		}
		if !parquet.Equal(pageMax, indexMax) {
			return fmt.Errorf("max page value mismatch: index=%q page=%q", indexMax, pageMax)
		}

		numNulls := int64(0)
		numValues := int64(0)
		err := forEachValue(page.Values(), func(value parquet.Value) error {
			stats.observe(value)
			if value.IsNull() {
				numNulls++
			}
			numValues++
			return nil
		})
		if err != nil {
			return err
		}

		nullCount := columnIndex.NullCount(pagesRead)
		if numNulls != nullCount {
			return fmt.Errorf("number of null values mismatch: index=%d page=%d", nullCount, numNulls)
		}

		nullPage := columnIndex.NullPage(pagesRead)
		if numNulls > 0 && numNulls == numValues && !nullPage {
			return fmt.Errorf("page only contained null values but the index did not categorize it as a null page: nulls=%d", numNulls)
		}

		stats.pageRead()
		pagesRead++
		return nil
	})
	if err != nil {
		return fmt.Errorf("page @i=%d: %w", pagesRead, err)
	}
	if pagesRead != numPages {
		return fmt.Errorf("number of pages found in column index differs from the number of pages read: index=%d read=%d", numPages, pagesRead)
	}

	actualOrder := columnIndexOrder(columnIndex)
	observedOrder := observedIndexOrder(columnType, stats.minValues, stats.maxValues)
	xorAscending := (columnIndex.IsAscending() || observedOrder == ascendingIndexOrder) &&
		!(columnIndex.IsAscending() && observedOrder == ascendingIndexOrder)
	xorDescending := (columnIndex.IsDescending() || observedOrder == descendingIndexOrder) &&
		!(columnIndex.IsDescending() && observedOrder == descendingIndexOrder)

	if xorAscending || xorDescending {
		return fmt.Errorf("column index is declared to be %s while observed values %s (min values %s, max values %s)",
			actualOrder,
			observedOrder,
			valueOrder(columnType, stats.minValues),
			valueOrder(columnType, stats.maxValues),
		)
	}

	return nil
}

func checkRowGroupOffsetIndex(rowGroup parquet.RowGroup) error {
	for i, column := range rowGroup.ColumnChunks() {
		if err := checkColumnChunkOffsetIndex(column); err != nil {
			return fmt.Errorf("column chunk @i=%d: %w", i, err)
		}
	}
	return nil
}
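
// checkColumnChunkOffsetIndex verifies that the offset index reports the
// correct first row index for every page of the column chunk and that it
// describes exactly as many pages as are read back from the chunk.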
func checkColumnChunkOffsetIndex(columnChunk parquet.ColumnChunk) error {
	offsetIndex, _ := columnChunk.OffsetIndex()
	numPages := offsetIndex.NumPages()
	pagesRead := 0
	rowIndex := int64(0)

	pages := columnChunk.Pages()
	defer pages.Close()

	err := forEachPage(pages, func(page parquet.Page) error {
		if firstRowIndex := offsetIndex.FirstRowIndex(pagesRead); firstRowIndex != rowIndex {
			return fmt.Errorf("row number mismatch: index=%d page=%d", firstRowIndex, rowIndex)
		}
		rowIndex += int64(page.NumRows())
		pagesRead++
		return nil
	})
	if err != nil {
		return fmt.Errorf("page @i=%d: %w", pagesRead, err)
	}

	if pagesRead != numPages {
		return fmt.Errorf("number of pages found in offset index differs from the number of pages read: index=%d read=%d", numPages, pagesRead)
	}

	return nil
}

func testColumnPageIndexWithFile(t *testing.T, rows rows) bool {
	if len(rows) > 0 {
		r := rand.New(rand.NewSource(5))
		f, err := createParquetFile(rows,
			parquet.PageBufferSize(r.Intn(49)+1),
			parquet.ColumnIndexSizeLimit(4096),
		)
		if err != nil {
			t.Error(err)
			return false
		}
		if err := checkFileColumnIndex(f); err != nil {
			t.Error(err)
			return false
		}
		if err := checkFileOffsetIndex(f); err != nil {
			t.Error(err)
			return false
		}
		for i, rowGroup := range f.RowGroups() {
			if err := checkRowGroupColumnIndex(rowGroup); err != nil {
				t.Errorf("checking column index of row group @i=%d: %v", i, err)
				return false
			}
			if err := checkRowGroupOffsetIndex(rowGroup); err != nil {
				t.Errorf("checking offset index of row group @i=%d: %v", i, err)
				return false
			}
		}
	}
	return true
}

func checkFileColumnIndex(f *parquet.File) error {
	columnIndexes := f.ColumnIndexes()
	i := 0
	return forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error {
		columnIndex, _ := chunk.ColumnIndex()
		if n := columnIndex.NumPages(); n <= 0 {
			return fmt.Errorf("invalid number of pages found in the column index: %d", n)
		}
		if i >= len(columnIndexes) {
			return fmt.Errorf("more column indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)", i, len(columnIndexes))
		}

		index1 := columnIndex
		index2 := &fileColumnIndex{
			kind:        col.Type().Kind(),
			ColumnIndex: columnIndexes[i],
		}

		numPages1 := index1.NumPages()
		numPages2 := index2.NumPages()
		if numPages1 != numPages2 {
			return fmt.Errorf("number of pages mismatch: got=%d want=%d", numPages1, numPages2)
		}

		for j := 0; j < numPages1; j++ {
			nullCount1 := index1.NullCount(j)
			nullCount2 := index2.NullCount(j)
			if nullCount1 != nullCount2 {
				return fmt.Errorf("null count of page %d/%d mismatch: got=%d want=%d", i, numPages1, nullCount1, nullCount2)
			}

			nullPage1 := index1.NullPage(j)
			nullPage2 := index2.NullPage(j)
			if nullPage1 != nullPage2 {
				return fmt.Errorf("null page of page %d/%d mismatch: got=%t want=%t", i, numPages1, nullPage1, nullPage2)
			}

			minValue1 := index1.MinValue(j)
			minValue2 := index2.MinValue(j)
			if !parquet.Equal(minValue1, minValue2) {
				return fmt.Errorf("min value of page %d/%d mismatch: got=%v want=%v", i, numPages1, minValue1, minValue2)
			}

			maxValue1 := index1.MaxValue(j)
			maxValue2 := index2.MaxValue(j)
			if !parquet.Equal(maxValue1, maxValue2) {
				return fmt.Errorf("max value of page %d/%d mismatch: got=%v want=%v", i, numPages1, maxValue1, maxValue2)
			}

			isAscending1 := index1.IsAscending()
			isAscending2 := index2.IsAscending()
			if isAscending1 != isAscending2 {
				return fmt.Errorf("ascending state of page %d/%d mismatch: got=%t want=%t", i, numPages1, isAscending1, isAscending2)
			}

			isDescending1 := index1.IsDescending()
			isDescending2 := index2.IsDescending()
			if isDescending1 != isDescending2 {
				return fmt.Errorf("descending state of page %d/%d mismatch: got=%t want=%t", i, numPages1, isDescending1, isDescending2)
			}
		}

		i++
		return nil
	})
}
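
// checkFileOffsetIndex compares the offset index exposed by each column chunk
// against the raw format.OffsetIndex entries decoded from the file footer,
// checking page offsets, compressed page sizes, and first row indexes.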
func checkFileOffsetIndex(f *parquet.File) error {
	offsetIndexes := f.OffsetIndexes()
	i := 0
	return forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error {
		offsetIndex, _ := chunk.OffsetIndex()
		if n := offsetIndex.NumPages(); n <= 0 {
			return fmt.Errorf("invalid number of pages found in the offset index: %d", n)
		}
		if i >= len(offsetIndexes) {
			return fmt.Errorf("more offset indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)", i, len(offsetIndexes))
		}

		index1 := offsetIndex
		index2 := (*fileOffsetIndex)(&offsetIndexes[i])

		numPages1 := index1.NumPages()
		numPages2 := index2.NumPages()
		if numPages1 != numPages2 {
			return fmt.Errorf("number of pages mismatch: got=%d want=%d", numPages1, numPages2)
		}

		for j := 0; j < numPages1; j++ {
			offset1 := index1.Offset(j)
			offset2 := index2.Offset(j)
			if offset1 != offset2 {
				return fmt.Errorf("offsets of page %d/%d mismatch: got=%d want=%d", i, numPages1, offset1, offset2)
			}

			compressedPageSize1 := index1.CompressedPageSize(j)
			compressedPageSize2 := index2.CompressedPageSize(j)
			if compressedPageSize1 != compressedPageSize2 {
				return fmt.Errorf("compressed page size of page %d/%d mismatch: got=%d want=%d", i, numPages1, compressedPageSize1, compressedPageSize2)
			}

			firstRowIndex1 := index1.FirstRowIndex(j)
			firstRowIndex2 := index2.FirstRowIndex(j)
			if firstRowIndex1 != firstRowIndex2 {
				return fmt.Errorf("first row index of page %d/%d mismatch: got=%d want=%d", i, numPages1, firstRowIndex1, firstRowIndex2)
			}
		}

		i++
		return nil
	})
}

type fileColumnIndex struct {
	kind parquet.Kind
	format.ColumnIndex
}

func (i *fileColumnIndex) NumPages() int                { return len(i.NullPages) }
func (i *fileColumnIndex) NullCount(j int) int64        { return i.NullCounts[j] }
func (i *fileColumnIndex) NullPage(j int) bool          { return i.NullPages[j] }
func (i *fileColumnIndex) MinValue(j int) parquet.Value { return i.kind.Value(i.MinValues[j]) }
func (i *fileColumnIndex) MaxValue(j int) parquet.Value { return i.kind.Value(i.MaxValues[j]) }
func (i *fileColumnIndex) IsAscending() bool            { return i.BoundaryOrder == format.Ascending }
func (i *fileColumnIndex) IsDescending() bool           { return i.BoundaryOrder == format.Descending }

type fileOffsetIndex format.OffsetIndex

func (i *fileOffsetIndex) NumPages() int      { return len(i.PageLocations) }
func (i *fileOffsetIndex) Offset(j int) int64 { return i.PageLocations[j].Offset }
func (i *fileOffsetIndex) CompressedPageSize(j int) int64 {
	return int64(i.PageLocations[j].CompressedPageSize)
}
func (i *fileOffsetIndex) FirstRowIndex(j int) int64 { return i.PageLocations[j].FirstRowIndex }

type columnStats struct {
	page       int
	columnType parquet.Type
	minValues  []parquet.Value
	maxValues  []parquet.Value
}

func newColumnStats(columnType parquet.Type) *columnStats {
	return &columnStats{columnType: columnType}
}
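
// observe folds a single value into the min/max statistics of the page
// currently being read; pageRead advances the tracker to the next page.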
func (c *columnStats) observe(value parquet.Value) {
	if c.page >= len(c.minValues) {
		c.minValues = append(c.minValues, value.Clone())
	} else if c.columnType.Compare(c.minValues[c.page], value) > 0 {
		c.minValues[c.page] = value.Clone()
	}

	if c.page >= len(c.maxValues) {
		c.maxValues = append(c.maxValues, value.Clone())
	} else if c.columnType.Compare(c.maxValues[c.page], value) < 0 {
		c.maxValues[c.page] = value.Clone()
	}
}

func (c *columnStats) pageRead() {
	c.page++
}

type indexOrder int

const (
	invalidIndexOrder indexOrder = iota
	unorderedIndexOrder
	ascendingIndexOrder
	descendingIndexOrder
)

func (o indexOrder) String() string {
	switch o {
	case unorderedIndexOrder:
		return "unordered"
	case ascendingIndexOrder:
		return "ascending"
	case descendingIndexOrder:
		return "descending"
	default:
		return "invalid"
	}
}

func columnIndexOrder(index parquet.ColumnIndex) indexOrder {
	switch {
	case index.IsAscending() && index.IsDescending():
		return invalidIndexOrder
	case index.IsAscending():
		return ascendingIndexOrder
	case index.IsDescending():
		return descendingIndexOrder
	default:
		return unorderedIndexOrder
	}
}

func observedIndexOrder(columnType parquet.Type, minValues []parquet.Value, maxValues []parquet.Value) indexOrder {
	a := valueOrder(columnType, minValues)
	b := valueOrder(columnType, maxValues)

	switch {
	case a == ascendingIndexOrder && b == ascendingIndexOrder:
		return ascendingIndexOrder
	case a == descendingIndexOrder && b == descendingIndexOrder:
		return descendingIndexOrder
	default:
		return unorderedIndexOrder
	}
}

func valueOrder(columnType parquet.Type, values []parquet.Value) indexOrder {
	switch len(values) {
	case 0, 1:
		return unorderedIndexOrder
	}

	var order int
	for i := 1; i < len(values); i++ {
		next := columnType.Compare(values[i-1], values[i])
		if next == 0 {
			continue
		}
		if order == 0 {
			order = next
			continue
		}
		if order != next {
			return unorderedIndexOrder
		}
	}

	if order > 0 {
		return descendingIndexOrder
	}

	return ascendingIndexOrder
}
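
// The sketch below is illustrative and not part of the original test file: it
// shows, using only APIs already exercised above (parquet.NewBuffer,
// Buffer.Write, ColumnChunk.ColumnIndex), how the per-page statistics
// validated by these tests could be inspected directly. The function name and
// the row shape are arbitrary examples chosen for this sketch.
func exampleInspectColumnIndex() error {
	b := parquet.NewBuffer()
	for i := 0; i < 10; i++ {
		// The buffer's schema is inferred from the first row written,
		// as in testColumnPageIndexWithBuffer above.
		b.Write(struct{ Value int64 }{Value: int64(i)})
	}
	for _, chunk := range b.ColumnChunks() {
		columnIndex, err := chunk.ColumnIndex()
		if err != nil {
			return err
		}
		for page := 0; page < columnIndex.NumPages(); page++ {
			fmt.Printf("page %d: min=%v max=%v nulls=%d nullPage=%t\n",
				page,
				columnIndex.MinValue(page),
				columnIndex.MaxValue(page),
				columnIndex.NullCount(page),
				columnIndex.NullPage(page),
			)
		}
	}
	return nil
}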