github.com/parquet-go/parquet-go@v0.20.0/column_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"fmt"
     5  	"math/rand"
     6  	"testing"
     7  
     8  	"github.com/google/uuid"
     9  
    10  	"github.com/parquet-go/parquet-go"
    11  	"github.com/parquet-go/parquet-go/deprecated"
    12  	"github.com/parquet-go/parquet-go/format"
    13  )
    14  
    15  func TestColumnPageIndex(t *testing.T) {
    16  	for _, config := range [...]struct {
    17  		name string
    18  		test func(*testing.T, rows) bool
    19  	}{
    20  		{
    21  			name: "buffer",
    22  			test: testColumnPageIndexWithBuffer,
    23  		},
    24  		{
    25  			name: "file",
    26  			test: testColumnPageIndexWithFile,
    27  		},
    28  	} {
    29  		t.Run(config.name, func(t *testing.T) {
    30  			for _, test := range [...]struct {
    31  				scenario string
    32  				function func(*testing.T) interface{}
    33  			}{
    34  				{
    35  					scenario: "boolean",
    36  					function: func(t *testing.T) interface{} {
    37  						return func(rows []struct{ Value bool }) bool { return config.test(t, makeRows(rows)) }
    38  					},
    39  				},
    40  
    41  				{
    42  					scenario: "int32",
    43  					function: func(t *testing.T) interface{} {
    44  						return func(rows []struct{ Value int32 }) bool { return config.test(t, makeRows(rows)) }
    45  					},
    46  				},
    47  
    48  				{
    49  					scenario: "int64",
    50  					function: func(t *testing.T) interface{} {
    51  						return func(rows []struct{ Value int64 }) bool { return config.test(t, makeRows(rows)) }
    52  					},
    53  				},
    54  
    55  				{
    56  					scenario: "int96",
    57  					function: func(t *testing.T) interface{} {
    58  						return func(rows []struct{ Value deprecated.Int96 }) bool { return config.test(t, makeRows(rows)) }
    59  					},
    60  				},
    61  
    62  				{
    63  					scenario: "uint32",
    64  					function: func(t *testing.T) interface{} {
    65  						return func(rows []struct{ Value uint32 }) bool { return config.test(t, makeRows(rows)) }
    66  					},
    67  				},
    68  
    69  				{
    70  					scenario: "uint64",
    71  					function: func(t *testing.T) interface{} {
    72  						return func(rows []struct{ Value uint64 }) bool { return config.test(t, makeRows(rows)) }
    73  					},
    74  				},
    75  
    76  				{
    77  					scenario: "float32",
    78  					function: func(t *testing.T) interface{} {
    79  						return func(rows []struct{ Value float32 }) bool { return config.test(t, makeRows(rows)) }
    80  					},
    81  				},
    82  
    83  				{
    84  					scenario: "float64",
    85  					function: func(t *testing.T) interface{} {
    86  						return func(rows []struct{ Value float64 }) bool { return config.test(t, makeRows(rows)) }
    87  					},
    88  				},
    89  
    90  				{
    91  					scenario: "string",
    92  					function: func(t *testing.T) interface{} {
    93  						return func(rows []struct{ Value string }) bool { return config.test(t, makeRows(rows)) }
    94  					},
    95  				},
    96  
    97  				{
    98  					scenario: "uuid",
    99  					function: func(t *testing.T) interface{} {
   100  						return func(rows []struct{ Value uuid.UUID }) bool { return config.test(t, makeRows(rows)) }
   101  					},
   102  				},
   103  			} {
   104  				t.Run(test.scenario, func(t *testing.T) {
   105  					if err := quickCheck(test.function(t)); err != nil {
   106  						t.Error(err)
   107  					}
   108  				})
   109  			}
   110  		})
   111  	}
   112  }
   113  
   114  func testColumnPageIndexWithBuffer(t *testing.T, rows rows) bool {
   115  	if len(rows) > 0 {
   116  		b := parquet.NewBuffer()
   117  		for _, row := range rows {
   118  			b.Write(row)
   119  		}
   120  		if err := checkRowGroupColumnIndex(b); err != nil {
   121  			t.Error(err)
   122  			return false
   123  		}
   124  		if err := checkRowGroupOffsetIndex(b); err != nil {
   125  			t.Error(err)
   126  			return false
   127  		}
   128  	}
   129  	return true
   130  }
   131  
   132  func checkRowGroupColumnIndex(rowGroup parquet.RowGroup) error {
   133  	for i, column := range rowGroup.ColumnChunks() {
   134  		if err := checkColumnChunkColumnIndex(column); err != nil {
   135  			return fmt.Errorf("column chunk @i=%d: %w", i, err)
   136  		}
   137  	}
   138  	return nil
   139  }
   140  
   141  func checkColumnChunkColumnIndex(columnChunk parquet.ColumnChunk) error {
   142  	columnType := columnChunk.Type()
   143  	columnIndex, _ := columnChunk.ColumnIndex()
   144  	numPages := columnIndex.NumPages()
   145  	pagesRead := 0
   146  	stats := newColumnStats(columnType)
   147  	pages := columnChunk.Pages()
   148  	defer pages.Close()
   149  
   150  	err := forEachPage(pages, func(page parquet.Page) error {
   151  		pageMin, pageMax, hasBounds := page.Bounds()
   152  		if !hasBounds {
   153  			return fmt.Errorf("page bounds are missing")
   154  		}
   155  		indexMin := columnIndex.MinValue(pagesRead)
   156  		indexMax := columnIndex.MaxValue(pagesRead)
   157  
   158  		if !parquet.Equal(pageMin, indexMin) {
   159  			return fmt.Errorf("max page value mismatch: index=%q page=%q", indexMin, pageMin)
   160  		}
   161  		if !parquet.Equal(pageMax, indexMax) {
   162  			return fmt.Errorf("max page value mismatch: index=%q page=%q", indexMax, pageMax)
   163  		}
   164  
   165  		numNulls := int64(0)
   166  		numValues := int64(0)
   167  		err := forEachValue(page.Values(), func(value parquet.Value) error {
   168  			stats.observe(value)
   169  			if value.IsNull() {
   170  				numNulls++
   171  			}
   172  			numValues++
   173  			return nil
   174  		})
   175  		if err != nil {
   176  			return err
   177  		}
   178  
   179  		nullCount := columnIndex.NullCount(pagesRead)
   180  		if numNulls != nullCount {
   181  			return fmt.Errorf("number of null values mimatch: index=%d page=%d", nullCount, numNulls)
   182  		}
   183  
   184  		nullPage := columnIndex.NullPage(pagesRead)
   185  		if numNulls > 0 && numNulls == numValues && !nullPage {
   186  			return fmt.Errorf("page only contained null values but the index did not categorize it as a null page: nulls=%d", numNulls)
   187  		}
   188  
   189  		stats.pageRead()
   190  		pagesRead++
   191  		return nil
   192  	})
   193  	if err != nil {
   194  		return fmt.Errorf("page @i=%d: %w", pagesRead, err)
   195  	}
   196  	if pagesRead != numPages {
   197  		return fmt.Errorf("number of pages found in column index differs from the number of pages read: index=%d read=%d", numPages, pagesRead)
   198  	}
   199  
   200  	actualOrder := columnIndexOrder(columnIndex)
   201  	observedOrder := observedIndexOrder(columnType, stats.minValues, stats.maxValues)
   202  	xorAscending := (columnIndex.IsAscending() || observedOrder == ascendingIndexOrder) &&
   203  		!(columnIndex.IsAscending() && observedOrder == ascendingIndexOrder)
   204  	xorDescending := (columnIndex.IsDescending() || observedOrder == descendingIndexOrder) &&
   205  		!(columnIndex.IsDescending() && observedOrder == descendingIndexOrder)
   206  
   207  	if xorAscending || xorDescending {
   208  		return fmt.Errorf("column index is declared to be %s while observed values %s (min values %s, max values %s)",
   209  			actualOrder,
   210  			observedOrder,
   211  			valueOrder(columnType, stats.minValues),
   212  			valueOrder(columnType, stats.maxValues),
   213  		)
   214  	}
   215  
   216  	return nil
   217  }
   218  
   219  func checkRowGroupOffsetIndex(rowGroup parquet.RowGroup) error {
   220  	for i, column := range rowGroup.ColumnChunks() {
   221  		if err := checkColumnChunkOffsetIndex(column); err != nil {
   222  			return fmt.Errorf("column chunk @i=%d: %w", i, err)
   223  		}
   224  	}
   225  	return nil
   226  }
   227  
   228  func checkColumnChunkOffsetIndex(columnChunk parquet.ColumnChunk) error {
   229  	offsetIndex, _ := columnChunk.OffsetIndex()
   230  	numPages := offsetIndex.NumPages()
   231  	pagesRead := 0
   232  	rowIndex := int64(0)
   233  
   234  	pages := columnChunk.Pages()
   235  	defer pages.Close()
   236  
   237  	err := forEachPage(pages, func(page parquet.Page) error {
   238  		if firstRowIndex := offsetIndex.FirstRowIndex(pagesRead); firstRowIndex != rowIndex {
   239  			return fmt.Errorf("row number mismatch: index=%d page=%d", firstRowIndex, rowIndex)
   240  		}
   241  		rowIndex += int64(page.NumRows())
   242  		pagesRead++
   243  		return nil
   244  	})
   245  	if err != nil {
   246  		return fmt.Errorf("page @i=%d: %w", pagesRead, err)
   247  	}
   248  
   249  	if pagesRead != numPages {
   250  		return fmt.Errorf("number of pages found in offset index differs from the number of pages read: index=%d read=%d", numPages, pagesRead)
   251  	}
   252  
   253  	return nil
   254  }
   255  
   256  func testColumnPageIndexWithFile(t *testing.T, rows rows) bool {
   257  	if len(rows) > 0 {
   258  		r := rand.New(rand.NewSource(5))
   259  		f, err := createParquetFile(rows,
   260  			parquet.PageBufferSize(r.Intn(49)+1),
   261  			parquet.ColumnIndexSizeLimit(4096),
   262  		)
   263  		if err != nil {
   264  			t.Error(err)
   265  			return false
   266  		}
   267  		if err := checkFileColumnIndex(f); err != nil {
   268  			t.Error(err)
   269  			return false
   270  		}
   271  		if err := checkFileOffsetIndex(f); err != nil {
   272  			t.Error(err)
   273  			return false
   274  		}
   275  		for i, rowGroup := range f.RowGroups() {
   276  			if err := checkRowGroupColumnIndex(rowGroup); err != nil {
   277  				t.Errorf("checking column index of row group @i=%d: %v", i, err)
   278  				return false
   279  			}
   280  			if err := checkRowGroupOffsetIndex(rowGroup); err != nil {
   281  				t.Errorf("checking offset index of row group @i=%d: %v", i, err)
   282  				return false
   283  			}
   284  		}
   285  	}
   286  	return true
   287  }
   288  
   289  func checkFileColumnIndex(f *parquet.File) error {
   290  	columnIndexes := f.ColumnIndexes()
   291  	i := 0
   292  	return forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error {
   293  		columnIndex, _ := chunk.ColumnIndex()
   294  		if n := columnIndex.NumPages(); n <= 0 {
   295  			return fmt.Errorf("invalid number of pages found in the column index: %d", n)
   296  		}
   297  		if i >= len(columnIndexes) {
   298  			return fmt.Errorf("more column indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)", i, len(columnIndexes))
   299  		}
   300  
   301  		index1 := columnIndex
   302  		index2 := &fileColumnIndex{
   303  			kind:        col.Type().Kind(),
   304  			ColumnIndex: columnIndexes[i],
   305  		}
   306  
   307  		numPages1 := index1.NumPages()
   308  		numPages2 := index2.NumPages()
   309  		if numPages1 != numPages2 {
   310  			return fmt.Errorf("number of pages mismatch: got=%d want=%d", numPages1, numPages2)
   311  		}
   312  
   313  		for j := 0; j < numPages1; j++ {
   314  			nullCount1 := index1.NullCount(j)
   315  			nullCount2 := index2.NullCount(j)
   316  			if nullCount1 != nullCount2 {
   317  				return fmt.Errorf("null count of page %d/%d mismatch: got=%d want=%d", i, numPages1, nullCount1, nullCount2)
   318  			}
   319  
   320  			nullPage1 := index1.NullPage(j)
   321  			nullPage2 := index2.NullPage(j)
   322  			if nullPage1 != nullPage2 {
   323  				return fmt.Errorf("null page of page %d/%d mismatch: got=%t want=%t", i, numPages1, nullPage1, nullPage2)
   324  			}
   325  
   326  			minValue1 := index1.MinValue(j)
   327  			minValue2 := index2.MinValue(j)
   328  			if !parquet.Equal(minValue1, minValue2) {
   329  				return fmt.Errorf("min value of page %d/%d mismatch: got=%v want=%v", i, numPages1, minValue1, minValue2)
   330  			}
   331  
   332  			maxValue1 := index1.MaxValue(j)
   333  			maxValue2 := index2.MaxValue(j)
   334  			if !parquet.Equal(maxValue1, maxValue2) {
   335  				return fmt.Errorf("max value of page %d/%d mismatch: got=%v want=%v", i, numPages1, maxValue1, maxValue2)
   336  			}
   337  
   338  			isAscending1 := index1.IsAscending()
   339  			isAscending2 := index2.IsAscending()
   340  			if isAscending1 != isAscending2 {
   341  				return fmt.Errorf("ascending state of page %d/%d mismatch: got=%t want=%t", i, numPages1, isAscending1, isAscending2)
   342  			}
   343  
   344  			isDescending1 := index1.IsDescending()
   345  			isDescending2 := index2.IsDescending()
   346  			if isDescending1 != isDescending2 {
   347  				return fmt.Errorf("descending state of page %d/%d mismatch: got=%t want=%t", i, numPages1, isDescending1, isDescending2)
   348  			}
   349  		}
   350  
   351  		i++
   352  		return nil
   353  	})
   354  }
   355  
   356  func checkFileOffsetIndex(f *parquet.File) error {
   357  	offsetIndexes := f.OffsetIndexes()
   358  	i := 0
   359  	return forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error {
   360  		offsetIndex, _ := chunk.OffsetIndex()
   361  		if n := offsetIndex.NumPages(); n <= 0 {
   362  			return fmt.Errorf("invalid number of pages found in the offset index: %d", n)
   363  		}
   364  		if i >= len(offsetIndexes) {
   365  			return fmt.Errorf("more offset indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)", i, len(offsetIndexes))
   366  		}
   367  
   368  		index1 := offsetIndex
   369  		index2 := (*fileOffsetIndex)(&offsetIndexes[i])
   370  
   371  		numPages1 := index1.NumPages()
   372  		numPages2 := index2.NumPages()
   373  		if numPages1 != numPages2 {
   374  			return fmt.Errorf("number of pages mismatch: got=%d want=%d", numPages1, numPages2)
   375  		}
   376  
   377  		for j := 0; j < numPages1; j++ {
   378  			offset1 := index1.Offset(j)
   379  			offset2 := index2.Offset(j)
   380  			if offset1 != offset2 {
   381  				return fmt.Errorf("offsets of page %d/%d mismatch: got=%d want=%d", i, numPages1, offset1, offset2)
   382  			}
   383  
   384  			compressedPageSize1 := index1.CompressedPageSize(j)
   385  			compressedPageSize2 := index2.CompressedPageSize(j)
   386  			if compressedPageSize1 != compressedPageSize2 {
   387  				return fmt.Errorf("compressed page size of page %d/%d mismatch: got=%d want=%d", i, numPages1, compressedPageSize1, compressedPageSize2)
   388  			}
   389  
   390  			firstRowIndex1 := index1.FirstRowIndex(j)
   391  			firstRowIndex2 := index2.FirstRowIndex(j)
   392  			if firstRowIndex1 != firstRowIndex2 {
   393  				return fmt.Errorf("first row index of page %d/%d mismatch: got=%d want=%d", i, numPages1, firstRowIndex1, firstRowIndex2)
   394  			}
   395  		}
   396  
   397  		i++
   398  		return nil
   399  	})
   400  }
   401  
   402  type fileColumnIndex struct {
   403  	kind parquet.Kind
   404  	format.ColumnIndex
   405  }
   406  
   407  func (i *fileColumnIndex) NumPages() int                { return len(i.NullPages) }
   408  func (i *fileColumnIndex) NullCount(j int) int64        { return i.NullCounts[j] }
   409  func (i *fileColumnIndex) NullPage(j int) bool          { return i.NullPages[j] }
   410  func (i *fileColumnIndex) MinValue(j int) parquet.Value { return i.kind.Value(i.MinValues[j]) }
   411  func (i *fileColumnIndex) MaxValue(j int) parquet.Value { return i.kind.Value(i.MaxValues[j]) }
   412  func (i *fileColumnIndex) IsAscending() bool            { return i.BoundaryOrder == format.Ascending }
   413  func (i *fileColumnIndex) IsDescending() bool           { return i.BoundaryOrder == format.Descending }
   414  
   415  type fileOffsetIndex format.OffsetIndex
   416  
   417  func (i *fileOffsetIndex) NumPages() int      { return len(i.PageLocations) }
   418  func (i *fileOffsetIndex) Offset(j int) int64 { return i.PageLocations[j].Offset }
   419  func (i *fileOffsetIndex) CompressedPageSize(j int) int64 {
   420  	return int64(i.PageLocations[j].CompressedPageSize)
   421  }
   422  func (i *fileOffsetIndex) FirstRowIndex(j int) int64 { return i.PageLocations[j].FirstRowIndex }
   423  
   424  type columnStats struct {
   425  	page       int
   426  	columnType parquet.Type
   427  	minValues  []parquet.Value
   428  	maxValues  []parquet.Value
   429  }
   430  
   431  func newColumnStats(columnType parquet.Type) *columnStats {
   432  	return &columnStats{columnType: columnType}
   433  }
   434  
   435  func (c *columnStats) observe(value parquet.Value) {
   436  	if c.page >= len(c.minValues) {
   437  		c.minValues = append(c.minValues, value.Clone())
   438  	} else if c.columnType.Compare(c.minValues[c.page], value) > 0 {
   439  		c.minValues[c.page] = value.Clone()
   440  	}
   441  
   442  	if c.page >= len(c.maxValues) {
   443  		c.maxValues = append(c.maxValues, value.Clone())
   444  	} else if c.columnType.Compare(c.maxValues[c.page], value) < 0 {
   445  		c.maxValues[c.page] = value.Clone()
   446  	}
   447  }
   448  
   449  func (c *columnStats) pageRead() {
   450  	c.page++
   451  }
   452  
   453  type indexOrder int
   454  
   455  const (
   456  	invalidIndexOrder indexOrder = iota
   457  	unorderedIndexOrder
   458  	ascendingIndexOrder
   459  	descendingIndexOrder
   460  )
   461  
   462  func (o indexOrder) String() string {
   463  	switch o {
   464  	case unorderedIndexOrder:
   465  		return "unordered"
   466  	case ascendingIndexOrder:
   467  		return "ascending"
   468  	case descendingIndexOrder:
   469  		return "descending"
   470  	default:
   471  		return "invalid"
   472  	}
   473  }
   474  
   475  func columnIndexOrder(index parquet.ColumnIndex) indexOrder {
   476  	switch {
   477  	case index.IsAscending() && index.IsDescending():
   478  		return invalidIndexOrder
   479  	case index.IsAscending():
   480  		return ascendingIndexOrder
   481  	case index.IsDescending():
   482  		return descendingIndexOrder
   483  	default:
   484  		return unorderedIndexOrder
   485  	}
   486  }
   487  
   488  func observedIndexOrder(columnType parquet.Type, minValues []parquet.Value, maxValues []parquet.Value) indexOrder {
   489  	a := valueOrder(columnType, minValues)
   490  	b := valueOrder(columnType, maxValues)
   491  
   492  	switch {
   493  	case a == ascendingIndexOrder && b == ascendingIndexOrder:
   494  		return ascendingIndexOrder
   495  	case a == descendingIndexOrder && b == descendingIndexOrder:
   496  		return descendingIndexOrder
   497  	default:
   498  		return unorderedIndexOrder
   499  	}
   500  }
   501  
   502  func valueOrder(columnType parquet.Type, values []parquet.Value) indexOrder {
   503  	switch len(values) {
   504  	case 0, 1:
   505  		return unorderedIndexOrder
   506  	}
   507  
   508  	var order int
   509  	for i := 1; i < len(values); i++ {
   510  		next := columnType.Compare(values[i-1], values[i])
   511  		if next == 0 {
   512  			continue
   513  		}
   514  		if order == 0 {
   515  			order = next
   516  			continue
   517  		}
   518  		if order != next {
   519  			return unorderedIndexOrder
   520  		}
   521  	}
   522  
   523  	if order > 0 {
   524  		return descendingIndexOrder
   525  	}
   526  
   527  	return ascendingIndexOrder
   528  }