github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/column_test.go (about) 1 package parquet_test 2 3 import ( 4 "fmt" 5 "math/rand" 6 "testing" 7 8 "github.com/google/uuid" 9 "github.com/segmentio/parquet-go" 10 "github.com/segmentio/parquet-go/deprecated" 11 "github.com/segmentio/parquet-go/format" 12 ) 13 14 func TestColumnPageIndex(t *testing.T) { 15 for _, config := range [...]struct { 16 name string 17 test func(*testing.T, rows) bool 18 }{ 19 { 20 name: "buffer", 21 test: testColumnPageIndexWithBuffer, 22 }, 23 { 24 name: "file", 25 test: testColumnPageIndexWithFile, 26 }, 27 } { 28 t.Run(config.name, func(t *testing.T) { 29 for _, test := range [...]struct { 30 scenario string 31 function func(*testing.T) interface{} 32 }{ 33 { 34 scenario: "boolean", 35 function: func(t *testing.T) interface{} { 36 return func(rows []struct{ Value bool }) bool { return config.test(t, makeRows(rows)) } 37 }, 38 }, 39 40 { 41 scenario: "int32", 42 function: func(t *testing.T) interface{} { 43 return func(rows []struct{ Value int32 }) bool { return config.test(t, makeRows(rows)) } 44 }, 45 }, 46 47 { 48 scenario: "int64", 49 function: func(t *testing.T) interface{} { 50 return func(rows []struct{ Value int64 }) bool { return config.test(t, makeRows(rows)) } 51 }, 52 }, 53 54 { 55 scenario: "int96", 56 function: func(t *testing.T) interface{} { 57 return func(rows []struct{ Value deprecated.Int96 }) bool { return config.test(t, makeRows(rows)) } 58 }, 59 }, 60 61 { 62 scenario: "uint32", 63 function: func(t *testing.T) interface{} { 64 return func(rows []struct{ Value uint32 }) bool { return config.test(t, makeRows(rows)) } 65 }, 66 }, 67 68 { 69 scenario: "uint64", 70 function: func(t *testing.T) interface{} { 71 return func(rows []struct{ Value uint64 }) bool { return config.test(t, makeRows(rows)) } 72 }, 73 }, 74 75 { 76 scenario: "float32", 77 function: func(t *testing.T) interface{} { 78 return func(rows []struct{ Value float32 }) bool { return config.test(t, makeRows(rows)) } 79 }, 80 }, 81 82 { 83 scenario: "float64", 84 function: func(t *testing.T) interface{} { 85 return func(rows []struct{ Value float64 }) bool { return config.test(t, makeRows(rows)) } 86 }, 87 }, 88 89 { 90 scenario: "string", 91 function: func(t *testing.T) interface{} { 92 return func(rows []struct{ Value string }) bool { return config.test(t, makeRows(rows)) } 93 }, 94 }, 95 96 { 97 scenario: "uuid", 98 function: func(t *testing.T) interface{} { 99 return func(rows []struct{ Value uuid.UUID }) bool { return config.test(t, makeRows(rows)) } 100 }, 101 }, 102 } { 103 t.Run(test.scenario, func(t *testing.T) { 104 if err := quickCheck(test.function(t)); err != nil { 105 t.Error(err) 106 } 107 }) 108 } 109 }) 110 } 111 } 112 113 func testColumnPageIndexWithBuffer(t *testing.T, rows rows) bool { 114 if len(rows) > 0 { 115 b := parquet.NewBuffer() 116 for _, row := range rows { 117 b.Write(row) 118 } 119 if err := checkRowGroupColumnIndex(b); err != nil { 120 t.Error(err) 121 return false 122 } 123 if err := checkRowGroupOffsetIndex(b); err != nil { 124 t.Error(err) 125 return false 126 } 127 } 128 return true 129 } 130 131 func checkRowGroupColumnIndex(rowGroup parquet.RowGroup) error { 132 for i, column := range rowGroup.ColumnChunks() { 133 if err := checkColumnChunkColumnIndex(column); err != nil { 134 return fmt.Errorf("column chunk @i=%d: %w", i, err) 135 } 136 } 137 return nil 138 } 139 140 func checkColumnChunkColumnIndex(columnChunk parquet.ColumnChunk) error { 141 columnType := columnChunk.Type() 142 columnIndex := columnChunk.ColumnIndex() 143 numPages := columnIndex.NumPages() 144 pagesRead := 0 145 stats := newColumnStats(columnType) 146 pages := columnChunk.Pages() 147 defer pages.Close() 148 149 err := forEachPage(pages, func(page parquet.Page) error { 150 pageMin, pageMax, hasBounds := page.Bounds() 151 if !hasBounds { 152 return fmt.Errorf("page bounds are missing") 153 } 154 indexMin := columnIndex.MinValue(pagesRead) 155 indexMax := columnIndex.MaxValue(pagesRead) 156 157 if !parquet.Equal(pageMin, indexMin) { 158 return fmt.Errorf("max page value mismatch: index=%q page=%q", indexMin, pageMin) 159 } 160 if !parquet.Equal(pageMax, indexMax) { 161 return fmt.Errorf("max page value mismatch: index=%q page=%q", indexMax, pageMax) 162 } 163 164 numNulls := int64(0) 165 numValues := int64(0) 166 err := forEachValue(page.Values(), func(value parquet.Value) error { 167 stats.observe(value) 168 if value.IsNull() { 169 numNulls++ 170 } 171 numValues++ 172 return nil 173 }) 174 if err != nil { 175 return err 176 } 177 178 nullCount := columnIndex.NullCount(pagesRead) 179 if numNulls != nullCount { 180 return fmt.Errorf("number of null values mimatch: index=%d page=%d", nullCount, numNulls) 181 } 182 183 nullPage := columnIndex.NullPage(pagesRead) 184 if numNulls > 0 && numNulls == numValues && !nullPage { 185 return fmt.Errorf("page only contained null values but the index did not categorize it as a null page: nulls=%d", numNulls) 186 } 187 188 stats.pageRead() 189 pagesRead++ 190 return nil 191 }) 192 if err != nil { 193 return fmt.Errorf("page @i=%d: %w", pagesRead, err) 194 } 195 if pagesRead != numPages { 196 return fmt.Errorf("number of pages found in column index differs from the number of pages read: index=%d read=%d", numPages, pagesRead) 197 } 198 199 actualOrder := columnIndexOrder(columnIndex) 200 observedOrder := observedIndexOrder(columnType, stats.minValues, stats.maxValues) 201 xorAscending := (columnIndex.IsAscending() || observedOrder == ascendingIndexOrder) && 202 !(columnIndex.IsAscending() && observedOrder == ascendingIndexOrder) 203 xorDescending := (columnIndex.IsDescending() || observedOrder == descendingIndexOrder) && 204 !(columnIndex.IsDescending() && observedOrder == descendingIndexOrder) 205 206 if xorAscending || xorDescending { 207 return fmt.Errorf("column index is declared to be %s while observed values %s (min values %s, max values %s)", 208 actualOrder, 209 observedOrder, 210 valueOrder(columnType, stats.minValues), 211 valueOrder(columnType, stats.maxValues), 212 ) 213 } 214 215 return nil 216 } 217 218 func checkRowGroupOffsetIndex(rowGroup parquet.RowGroup) error { 219 for i, column := range rowGroup.ColumnChunks() { 220 if err := checkColumnChunkOffsetIndex(column); err != nil { 221 return fmt.Errorf("column chunk @i=%d: %w", i, err) 222 } 223 } 224 return nil 225 } 226 227 func checkColumnChunkOffsetIndex(columnChunk parquet.ColumnChunk) error { 228 offsetIndex := columnChunk.OffsetIndex() 229 numPages := offsetIndex.NumPages() 230 pagesRead := 0 231 rowIndex := int64(0) 232 233 pages := columnChunk.Pages() 234 defer pages.Close() 235 236 err := forEachPage(pages, func(page parquet.Page) error { 237 if firstRowIndex := offsetIndex.FirstRowIndex(pagesRead); firstRowIndex != rowIndex { 238 return fmt.Errorf("row number mismatch: index=%d page=%d", firstRowIndex, rowIndex) 239 } 240 rowIndex += int64(page.NumRows()) 241 pagesRead++ 242 return nil 243 }) 244 if err != nil { 245 return fmt.Errorf("page @i=%d: %w", pagesRead, err) 246 } 247 248 if pagesRead != numPages { 249 return fmt.Errorf("number of pages found in offset index differs from the number of pages read: index=%d read=%d", numPages, pagesRead) 250 } 251 252 return nil 253 } 254 255 func testColumnPageIndexWithFile(t *testing.T, rows rows) bool { 256 if len(rows) > 0 { 257 r := rand.New(rand.NewSource(5)) 258 f, err := createParquetFile(rows, 259 parquet.PageBufferSize(r.Intn(49)+1), 260 parquet.ColumnIndexSizeLimit(4096), 261 ) 262 if err != nil { 263 t.Error(err) 264 return false 265 } 266 if err := checkFileColumnIndex(f); err != nil { 267 t.Error(err) 268 return false 269 } 270 if err := checkFileOffsetIndex(f); err != nil { 271 t.Error(err) 272 return false 273 } 274 for i, rowGroup := range f.RowGroups() { 275 if err := checkRowGroupColumnIndex(rowGroup); err != nil { 276 t.Errorf("checking column index of row group @i=%d: %v", i, err) 277 return false 278 } 279 if err := checkRowGroupOffsetIndex(rowGroup); err != nil { 280 t.Errorf("checking offset index of row group @i=%d: %v", i, err) 281 return false 282 } 283 } 284 } 285 return true 286 } 287 288 func checkFileColumnIndex(f *parquet.File) error { 289 columnIndexes := f.ColumnIndexes() 290 i := 0 291 return forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error { 292 columnIndex := chunk.ColumnIndex() 293 if n := columnIndex.NumPages(); n <= 0 { 294 return fmt.Errorf("invalid number of pages found in the column index: %d", n) 295 } 296 if i >= len(columnIndexes) { 297 return fmt.Errorf("more column indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)", i, len(columnIndexes)) 298 } 299 300 index1 := columnIndex 301 index2 := &fileColumnIndex{ 302 kind: col.Type().Kind(), 303 ColumnIndex: columnIndexes[i], 304 } 305 306 numPages1 := index1.NumPages() 307 numPages2 := index2.NumPages() 308 if numPages1 != numPages2 { 309 return fmt.Errorf("number of pages mismatch: got=%d want=%d", numPages1, numPages2) 310 } 311 312 for j := 0; j < numPages1; j++ { 313 nullCount1 := index1.NullCount(j) 314 nullCount2 := index2.NullCount(j) 315 if nullCount1 != nullCount2 { 316 return fmt.Errorf("null count of page %d/%d mismatch: got=%d want=%d", i, numPages1, nullCount1, nullCount2) 317 } 318 319 nullPage1 := index1.NullPage(j) 320 nullPage2 := index2.NullPage(j) 321 if nullPage1 != nullPage2 { 322 return fmt.Errorf("null page of page %d/%d mismatch: got=%t want=%t", i, numPages1, nullPage1, nullPage2) 323 } 324 325 minValue1 := index1.MinValue(j) 326 minValue2 := index2.MinValue(j) 327 if !parquet.Equal(minValue1, minValue2) { 328 return fmt.Errorf("min value of page %d/%d mismatch: got=%v want=%v", i, numPages1, minValue1, minValue2) 329 } 330 331 maxValue1 := index1.MaxValue(j) 332 maxValue2 := index2.MaxValue(j) 333 if !parquet.Equal(maxValue1, maxValue2) { 334 return fmt.Errorf("max value of page %d/%d mismatch: got=%v want=%v", i, numPages1, maxValue1, maxValue2) 335 } 336 337 isAscending1 := index1.IsAscending() 338 isAscending2 := index2.IsAscending() 339 if isAscending1 != isAscending2 { 340 return fmt.Errorf("ascending state of page %d/%d mismatch: got=%t want=%t", i, numPages1, isAscending1, isAscending2) 341 } 342 343 isDescending1 := index1.IsDescending() 344 isDescending2 := index2.IsDescending() 345 if isDescending1 != isDescending2 { 346 return fmt.Errorf("descending state of page %d/%d mismatch: got=%t want=%t", i, numPages1, isDescending1, isDescending2) 347 } 348 } 349 350 i++ 351 return nil 352 }) 353 } 354 355 func checkFileOffsetIndex(f *parquet.File) error { 356 offsetIndexes := f.OffsetIndexes() 357 i := 0 358 return forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error { 359 offsetIndex := chunk.OffsetIndex() 360 if n := offsetIndex.NumPages(); n <= 0 { 361 return fmt.Errorf("invalid number of pages found in the offset index: %d", n) 362 } 363 if i >= len(offsetIndexes) { 364 return fmt.Errorf("more offset indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)", i, len(offsetIndexes)) 365 } 366 367 index1 := offsetIndex 368 index2 := (*fileOffsetIndex)(&offsetIndexes[i]) 369 370 numPages1 := index1.NumPages() 371 numPages2 := index2.NumPages() 372 if numPages1 != numPages2 { 373 return fmt.Errorf("number of pages mismatch: got=%d want=%d", numPages1, numPages2) 374 } 375 376 for j := 0; j < numPages1; j++ { 377 offset1 := index1.Offset(j) 378 offset2 := index2.Offset(j) 379 if offset1 != offset2 { 380 return fmt.Errorf("offsets of page %d/%d mismatch: got=%d want=%d", i, numPages1, offset1, offset2) 381 } 382 383 compressedPageSize1 := index1.CompressedPageSize(j) 384 compressedPageSize2 := index2.CompressedPageSize(j) 385 if compressedPageSize1 != compressedPageSize2 { 386 return fmt.Errorf("compressed page size of page %d/%d mismatch: got=%d want=%d", i, numPages1, compressedPageSize1, compressedPageSize2) 387 } 388 389 firstRowIndex1 := index1.FirstRowIndex(j) 390 firstRowIndex2 := index2.FirstRowIndex(j) 391 if firstRowIndex1 != firstRowIndex2 { 392 return fmt.Errorf("first row index of page %d/%d mismatch: got=%d want=%d", i, numPages1, firstRowIndex1, firstRowIndex2) 393 } 394 } 395 396 i++ 397 return nil 398 }) 399 } 400 401 type fileColumnIndex struct { 402 kind parquet.Kind 403 format.ColumnIndex 404 } 405 406 func (i *fileColumnIndex) NumPages() int { return len(i.NullPages) } 407 func (i *fileColumnIndex) NullCount(j int) int64 { return i.NullCounts[j] } 408 func (i *fileColumnIndex) NullPage(j int) bool { return i.NullPages[j] } 409 func (i *fileColumnIndex) MinValue(j int) parquet.Value { return i.kind.Value(i.MinValues[j]) } 410 func (i *fileColumnIndex) MaxValue(j int) parquet.Value { return i.kind.Value(i.MaxValues[j]) } 411 func (i *fileColumnIndex) IsAscending() bool { return i.BoundaryOrder == format.Ascending } 412 func (i *fileColumnIndex) IsDescending() bool { return i.BoundaryOrder == format.Descending } 413 414 type fileOffsetIndex format.OffsetIndex 415 416 func (i *fileOffsetIndex) NumPages() int { return len(i.PageLocations) } 417 func (i *fileOffsetIndex) Offset(j int) int64 { return i.PageLocations[j].Offset } 418 func (i *fileOffsetIndex) CompressedPageSize(j int) int64 { 419 return int64(i.PageLocations[j].CompressedPageSize) 420 } 421 func (i *fileOffsetIndex) FirstRowIndex(j int) int64 { return i.PageLocations[j].FirstRowIndex } 422 423 type columnStats struct { 424 page int 425 columnType parquet.Type 426 minValues []parquet.Value 427 maxValues []parquet.Value 428 } 429 430 func newColumnStats(columnType parquet.Type) *columnStats { 431 return &columnStats{columnType: columnType} 432 } 433 434 func (c *columnStats) observe(value parquet.Value) { 435 if c.page >= len(c.minValues) { 436 c.minValues = append(c.minValues, value.Clone()) 437 } else if c.columnType.Compare(c.minValues[c.page], value) > 0 { 438 c.minValues[c.page] = value.Clone() 439 } 440 441 if c.page >= len(c.maxValues) { 442 c.maxValues = append(c.maxValues, value.Clone()) 443 } else if c.columnType.Compare(c.maxValues[c.page], value) < 0 { 444 c.maxValues[c.page] = value.Clone() 445 } 446 } 447 448 func (c *columnStats) pageRead() { 449 c.page++ 450 } 451 452 type indexOrder int 453 454 const ( 455 invalidIndexOrder indexOrder = iota 456 unorderedIndexOrder 457 ascendingIndexOrder 458 descendingIndexOrder 459 ) 460 461 func (o indexOrder) String() string { 462 switch o { 463 case unorderedIndexOrder: 464 return "unordered" 465 case ascendingIndexOrder: 466 return "ascending" 467 case descendingIndexOrder: 468 return "descending" 469 default: 470 return "invalid" 471 } 472 } 473 474 func columnIndexOrder(index parquet.ColumnIndex) indexOrder { 475 switch { 476 case index.IsAscending() && index.IsDescending(): 477 return invalidIndexOrder 478 case index.IsAscending(): 479 return ascendingIndexOrder 480 case index.IsDescending(): 481 return descendingIndexOrder 482 default: 483 return unorderedIndexOrder 484 } 485 } 486 487 func observedIndexOrder(columnType parquet.Type, minValues []parquet.Value, maxValues []parquet.Value) indexOrder { 488 a := valueOrder(columnType, minValues) 489 b := valueOrder(columnType, maxValues) 490 491 switch { 492 case a == ascendingIndexOrder && b == ascendingIndexOrder: 493 return ascendingIndexOrder 494 case a == descendingIndexOrder && b == descendingIndexOrder: 495 return descendingIndexOrder 496 default: 497 return unorderedIndexOrder 498 } 499 } 500 501 func valueOrder(columnType parquet.Type, values []parquet.Value) indexOrder { 502 switch len(values) { 503 case 0, 1: 504 return unorderedIndexOrder 505 } 506 507 var order int 508 for i := 1; i < len(values); i++ { 509 next := columnType.Compare(values[i-1], values[i]) 510 if next == 0 { 511 continue 512 } 513 if order == 0 { 514 order = next 515 continue 516 } 517 if order != next { 518 return unorderedIndexOrder 519 } 520 } 521 522 if order > 0 { 523 return descendingIndexOrder 524 } 525 526 return ascendingIndexOrder 527 }