github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/column_test.go (about)

     1  package parquet_test
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"math/rand"
     7  	"testing"
     8  
     9  	"github.com/google/uuid"
    10  
    11  	"github.com/parquet-go/parquet-go"
    12  	"github.com/parquet-go/parquet-go/deprecated"
    13  	"github.com/parquet-go/parquet-go/format"
    14  )
    15  
    16  func TestColumnPageIndex(t *testing.T) {
    17  	for _, config := range [...]struct {
    18  		name string
    19  		test func(*testing.T, rows) bool
    20  	}{
    21  		{
    22  			name: "buffer",
    23  			test: testColumnPageIndexWithBuffer,
    24  		},
    25  		{
    26  			name: "file",
    27  			test: testColumnPageIndexWithFile,
    28  		},
    29  	} {
    30  		t.Run(config.name, func(t *testing.T) {
    31  			for _, test := range [...]struct {
    32  				scenario string
    33  				function func(*testing.T) interface{}
    34  			}{
    35  				{
    36  					scenario: "boolean",
    37  					function: func(t *testing.T) interface{} {
    38  						return func(rows []struct{ Value bool }) bool { return config.test(t, makeRows(rows)) }
    39  					},
    40  				},
    41  
    42  				{
    43  					scenario: "int32",
    44  					function: func(t *testing.T) interface{} {
    45  						return func(rows []struct{ Value int32 }) bool { return config.test(t, makeRows(rows)) }
    46  					},
    47  				},
    48  
    49  				{
    50  					scenario: "int64",
    51  					function: func(t *testing.T) interface{} {
    52  						return func(rows []struct{ Value int64 }) bool { return config.test(t, makeRows(rows)) }
    53  					},
    54  				},
    55  
    56  				{
    57  					scenario: "int96",
    58  					function: func(t *testing.T) interface{} {
    59  						return func(rows []struct{ Value deprecated.Int96 }) bool { return config.test(t, makeRows(rows)) }
    60  					},
    61  				},
    62  
    63  				{
    64  					scenario: "uint32",
    65  					function: func(t *testing.T) interface{} {
    66  						return func(rows []struct{ Value uint32 }) bool { return config.test(t, makeRows(rows)) }
    67  					},
    68  				},
    69  
    70  				{
    71  					scenario: "uint64",
    72  					function: func(t *testing.T) interface{} {
    73  						return func(rows []struct{ Value uint64 }) bool { return config.test(t, makeRows(rows)) }
    74  					},
    75  				},
    76  
    77  				{
    78  					scenario: "float32",
    79  					function: func(t *testing.T) interface{} {
    80  						return func(rows []struct{ Value float32 }) bool { return config.test(t, makeRows(rows)) }
    81  					},
    82  				},
    83  
    84  				{
    85  					scenario: "float64",
    86  					function: func(t *testing.T) interface{} {
    87  						return func(rows []struct{ Value float64 }) bool { return config.test(t, makeRows(rows)) }
    88  					},
    89  				},
    90  
    91  				{
    92  					scenario: "string",
    93  					function: func(t *testing.T) interface{} {
    94  						return func(rows []struct{ Value string }) bool { return config.test(t, makeRows(rows)) }
    95  					},
    96  				},
    97  
    98  				{
    99  					scenario: "uuid",
   100  					function: func(t *testing.T) interface{} {
   101  						return func(rows []struct{ Value uuid.UUID }) bool { return config.test(t, makeRows(rows)) }
   102  					},
   103  				},
   104  			} {
   105  				t.Run(test.scenario, func(t *testing.T) {
   106  					if err := quickCheck(test.function(t)); err != nil {
   107  						t.Error(err)
   108  					}
   109  				})
   110  			}
   111  		})
   112  	}
   113  }
   114  
   115  func testColumnPageIndexWithBuffer(t *testing.T, rows rows) bool {
   116  	if len(rows) > 0 {
   117  		b := parquet.NewBuffer()
   118  		for _, row := range rows {
   119  			b.Write(row)
   120  		}
   121  		if err := checkRowGroupColumnIndex(b); err != nil {
   122  			t.Error(err)
   123  			return false
   124  		}
   125  		if err := checkRowGroupOffsetIndex(b); err != nil {
   126  			t.Error(err)
   127  			return false
   128  		}
   129  	}
   130  	return true
   131  }
   132  
   133  func checkRowGroupColumnIndex(rowGroup parquet.RowGroup) error {
   134  	for i, column := range rowGroup.ColumnChunks() {
   135  		if err := checkColumnChunkColumnIndex(column); err != nil {
   136  			return fmt.Errorf("column chunk @i=%d: %w", i, err)
   137  		}
   138  	}
   139  	return nil
   140  }
   141  
   142  func checkColumnChunkColumnIndex(columnChunk parquet.ColumnChunk) error {
   143  	columnType := columnChunk.Type()
   144  	columnIndex, _ := columnChunk.ColumnIndex()
   145  	numPages := columnIndex.NumPages()
   146  	pagesRead := 0
   147  	stats := newColumnStats(columnType)
   148  	pages := columnChunk.Pages()
   149  	defer pages.Close()
   150  
   151  	err := forEachPage(pages, func(page parquet.Page) error {
   152  		pageMin, pageMax, hasBounds := page.Bounds()
   153  		if !hasBounds {
   154  			return fmt.Errorf("page bounds are missing")
   155  		}
   156  		indexMin := columnIndex.MinValue(pagesRead)
   157  		indexMax := columnIndex.MaxValue(pagesRead)
   158  
   159  		if !parquet.Equal(pageMin, indexMin) {
   160  			return fmt.Errorf("max page value mismatch: index=%q page=%q", indexMin, pageMin)
   161  		}
   162  		if !parquet.Equal(pageMax, indexMax) {
   163  			return fmt.Errorf("max page value mismatch: index=%q page=%q", indexMax, pageMax)
   164  		}
   165  
   166  		numNulls := int64(0)
   167  		numValues := int64(0)
   168  		err := forEachValue(page.Values(), func(value parquet.Value) error {
   169  			stats.observe(value)
   170  			if value.IsNull() {
   171  				numNulls++
   172  			}
   173  			numValues++
   174  			return nil
   175  		})
   176  		if err != nil {
   177  			return err
   178  		}
   179  
   180  		nullCount := columnIndex.NullCount(pagesRead)
   181  		if numNulls != nullCount {
   182  			return fmt.Errorf("number of null values mimatch: index=%d page=%d", nullCount, numNulls)
   183  		}
   184  
   185  		nullPage := columnIndex.NullPage(pagesRead)
   186  		if numNulls > 0 && numNulls == numValues && !nullPage {
   187  			return fmt.Errorf("page only contained null values but the index did not categorize it as a null page: nulls=%d", numNulls)
   188  		}
   189  
   190  		stats.pageRead()
   191  		pagesRead++
   192  		return nil
   193  	})
   194  	if err != nil {
   195  		return fmt.Errorf("page @i=%d: %w", pagesRead, err)
   196  	}
   197  	if pagesRead != numPages {
   198  		return fmt.Errorf("number of pages found in column index differs from the number of pages read: index=%d read=%d", numPages, pagesRead)
   199  	}
   200  
   201  	actualOrder := columnIndexOrder(columnIndex)
   202  	observedOrder := observedIndexOrder(columnType, stats.minValues, stats.maxValues)
   203  	xorAscending := (columnIndex.IsAscending() || observedOrder == ascendingIndexOrder) &&
   204  		!(columnIndex.IsAscending() && observedOrder == ascendingIndexOrder)
   205  	xorDescending := (columnIndex.IsDescending() || observedOrder == descendingIndexOrder) &&
   206  		!(columnIndex.IsDescending() && observedOrder == descendingIndexOrder)
   207  
   208  	if xorAscending || xorDescending {
   209  		return fmt.Errorf("column index is declared to be %s while observed values %s (min values %s, max values %s)",
   210  			actualOrder,
   211  			observedOrder,
   212  			valueOrder(columnType, stats.minValues),
   213  			valueOrder(columnType, stats.maxValues),
   214  		)
   215  	}
   216  
   217  	return nil
   218  }
   219  
   220  func checkRowGroupOffsetIndex(rowGroup parquet.RowGroup) error {
   221  	for i, column := range rowGroup.ColumnChunks() {
   222  		if err := checkColumnChunkOffsetIndex(column); err != nil {
   223  			return fmt.Errorf("column chunk @i=%d: %w", i, err)
   224  		}
   225  	}
   226  	return nil
   227  }
   228  
   229  func checkColumnChunkOffsetIndex(columnChunk parquet.ColumnChunk) error {
   230  	offsetIndex, _ := columnChunk.OffsetIndex()
   231  	numPages := offsetIndex.NumPages()
   232  	pagesRead := 0
   233  	rowIndex := int64(0)
   234  
   235  	pages := columnChunk.Pages()
   236  	defer pages.Close()
   237  
   238  	err := forEachPage(pages, func(page parquet.Page) error {
   239  		if firstRowIndex := offsetIndex.FirstRowIndex(pagesRead); firstRowIndex != rowIndex {
   240  			return fmt.Errorf("row number mismatch: index=%d page=%d", firstRowIndex, rowIndex)
   241  		}
   242  		rowIndex += int64(page.NumRows())
   243  		pagesRead++
   244  		return nil
   245  	})
   246  	if err != nil {
   247  		return fmt.Errorf("page @i=%d: %w", pagesRead, err)
   248  	}
   249  
   250  	if pagesRead != numPages {
   251  		return fmt.Errorf("number of pages found in offset index differs from the number of pages read: index=%d read=%d", numPages, pagesRead)
   252  	}
   253  
   254  	return nil
   255  }
   256  
   257  func testColumnPageIndexWithFile(t *testing.T, rows rows) bool {
   258  	if len(rows) > 0 {
   259  		r := rand.New(rand.NewSource(5))
   260  		f, err := createParquetFile(rows,
   261  			parquet.PageBufferSize(r.Intn(49)+1),
   262  			parquet.ColumnIndexSizeLimit(4096),
   263  		)
   264  		if err != nil {
   265  			t.Error(err)
   266  			return false
   267  		}
   268  		if err := checkFileColumnIndex(f); err != nil {
   269  			t.Error(err)
   270  			return false
   271  		}
   272  		if err := checkFileOffsetIndex(f); err != nil {
   273  			t.Error(err)
   274  			return false
   275  		}
   276  		for i, rowGroup := range f.RowGroups() {
   277  			if err := checkRowGroupColumnIndex(rowGroup); err != nil {
   278  				t.Errorf("checking column index of row group @i=%d: %v", i, err)
   279  				return false
   280  			}
   281  			if err := checkRowGroupOffsetIndex(rowGroup); err != nil {
   282  				t.Errorf("checking offset index of row group @i=%d: %v", i, err)
   283  				return false
   284  			}
   285  		}
   286  	}
   287  	return true
   288  }
   289  
   290  func checkFileColumnIndex(f *parquet.File) error {
   291  	columnIndexes := f.ColumnIndexes()
   292  	i := 0
   293  	return forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error {
   294  		columnIndex, _ := chunk.ColumnIndex()
   295  		if n := columnIndex.NumPages(); n <= 0 {
   296  			return fmt.Errorf("invalid number of pages found in the column index: %d", n)
   297  		}
   298  		if i >= len(columnIndexes) {
   299  			return fmt.Errorf("more column indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)", i, len(columnIndexes))
   300  		}
   301  
   302  		index1 := columnIndex
   303  		index2 := &fileColumnIndex{
   304  			kind:        col.Type().Kind(),
   305  			ColumnIndex: columnIndexes[i],
   306  		}
   307  
   308  		numPages1 := index1.NumPages()
   309  		numPages2 := index2.NumPages()
   310  		if numPages1 != numPages2 {
   311  			return fmt.Errorf("number of pages mismatch: got=%d want=%d", numPages1, numPages2)
   312  		}
   313  
   314  		for j := 0; j < numPages1; j++ {
   315  			nullCount1 := index1.NullCount(j)
   316  			nullCount2 := index2.NullCount(j)
   317  			if nullCount1 != nullCount2 {
   318  				return fmt.Errorf("null count of page %d/%d mismatch: got=%d want=%d", i, numPages1, nullCount1, nullCount2)
   319  			}
   320  
   321  			nullPage1 := index1.NullPage(j)
   322  			nullPage2 := index2.NullPage(j)
   323  			if nullPage1 != nullPage2 {
   324  				return fmt.Errorf("null page of page %d/%d mismatch: got=%t want=%t", i, numPages1, nullPage1, nullPage2)
   325  			}
   326  
   327  			minValue1 := index1.MinValue(j)
   328  			minValue2 := index2.MinValue(j)
   329  			if !parquet.Equal(minValue1, minValue2) {
   330  				return fmt.Errorf("min value of page %d/%d mismatch: got=%v want=%v", i, numPages1, minValue1, minValue2)
   331  			}
   332  
   333  			maxValue1 := index1.MaxValue(j)
   334  			maxValue2 := index2.MaxValue(j)
   335  			if !parquet.Equal(maxValue1, maxValue2) {
   336  				return fmt.Errorf("max value of page %d/%d mismatch: got=%v want=%v", i, numPages1, maxValue1, maxValue2)
   337  			}
   338  
   339  			isAscending1 := index1.IsAscending()
   340  			isAscending2 := index2.IsAscending()
   341  			if isAscending1 != isAscending2 {
   342  				return fmt.Errorf("ascending state of page %d/%d mismatch: got=%t want=%t", i, numPages1, isAscending1, isAscending2)
   343  			}
   344  
   345  			isDescending1 := index1.IsDescending()
   346  			isDescending2 := index2.IsDescending()
   347  			if isDescending1 != isDescending2 {
   348  				return fmt.Errorf("descending state of page %d/%d mismatch: got=%t want=%t", i, numPages1, isDescending1, isDescending2)
   349  			}
   350  		}
   351  
   352  		i++
   353  		return nil
   354  	})
   355  }
   356  
   357  func checkFileOffsetIndex(f *parquet.File) error {
   358  	offsetIndexes := f.OffsetIndexes()
   359  	i := 0
   360  	return forEachColumnChunk(f, func(col *parquet.Column, chunk parquet.ColumnChunk) error {
   361  		offsetIndex, _ := chunk.OffsetIndex()
   362  		if n := offsetIndex.NumPages(); n <= 0 {
   363  			return fmt.Errorf("invalid number of pages found in the offset index: %d", n)
   364  		}
   365  		if i >= len(offsetIndexes) {
   366  			return fmt.Errorf("more offset indexes were read when iterating over column chunks than when reading from the file (i=%d,n=%d)", i, len(offsetIndexes))
   367  		}
   368  
   369  		index1 := offsetIndex
   370  		index2 := (*fileOffsetIndex)(&offsetIndexes[i])
   371  
   372  		numPages1 := index1.NumPages()
   373  		numPages2 := index2.NumPages()
   374  		if numPages1 != numPages2 {
   375  			return fmt.Errorf("number of pages mismatch: got=%d want=%d", numPages1, numPages2)
   376  		}
   377  
   378  		for j := 0; j < numPages1; j++ {
   379  			offset1 := index1.Offset(j)
   380  			offset2 := index2.Offset(j)
   381  			if offset1 != offset2 {
   382  				return fmt.Errorf("offsets of page %d/%d mismatch: got=%d want=%d", i, numPages1, offset1, offset2)
   383  			}
   384  
   385  			compressedPageSize1 := index1.CompressedPageSize(j)
   386  			compressedPageSize2 := index2.CompressedPageSize(j)
   387  			if compressedPageSize1 != compressedPageSize2 {
   388  				return fmt.Errorf("compressed page size of page %d/%d mismatch: got=%d want=%d", i, numPages1, compressedPageSize1, compressedPageSize2)
   389  			}
   390  
   391  			firstRowIndex1 := index1.FirstRowIndex(j)
   392  			firstRowIndex2 := index2.FirstRowIndex(j)
   393  			if firstRowIndex1 != firstRowIndex2 {
   394  				return fmt.Errorf("first row index of page %d/%d mismatch: got=%d want=%d", i, numPages1, firstRowIndex1, firstRowIndex2)
   395  			}
   396  		}
   397  
   398  		i++
   399  		return nil
   400  	})
   401  }
   402  
   403  type fileColumnIndex struct {
   404  	kind parquet.Kind
   405  	format.ColumnIndex
   406  }
   407  
   408  func (i *fileColumnIndex) NumPages() int                { return len(i.NullPages) }
   409  func (i *fileColumnIndex) NullCount(j int) int64        { return i.NullCounts[j] }
   410  func (i *fileColumnIndex) NullPage(j int) bool          { return i.NullPages[j] }
   411  func (i *fileColumnIndex) MinValue(j int) parquet.Value { return i.kind.Value(i.MinValues[j]) }
   412  func (i *fileColumnIndex) MaxValue(j int) parquet.Value { return i.kind.Value(i.MaxValues[j]) }
   413  func (i *fileColumnIndex) IsAscending() bool            { return i.BoundaryOrder == format.Ascending }
   414  func (i *fileColumnIndex) IsDescending() bool           { return i.BoundaryOrder == format.Descending }
   415  
   416  type fileOffsetIndex format.OffsetIndex
   417  
   418  func (i *fileOffsetIndex) NumPages() int      { return len(i.PageLocations) }
   419  func (i *fileOffsetIndex) Offset(j int) int64 { return i.PageLocations[j].Offset }
   420  func (i *fileOffsetIndex) CompressedPageSize(j int) int64 {
   421  	return int64(i.PageLocations[j].CompressedPageSize)
   422  }
   423  func (i *fileOffsetIndex) FirstRowIndex(j int) int64 { return i.PageLocations[j].FirstRowIndex }
   424  
   425  type columnStats struct {
   426  	page       int
   427  	columnType parquet.Type
   428  	minValues  []parquet.Value
   429  	maxValues  []parquet.Value
   430  }
   431  
   432  func newColumnStats(columnType parquet.Type) *columnStats {
   433  	return &columnStats{columnType: columnType}
   434  }
   435  
   436  func (c *columnStats) observe(value parquet.Value) {
   437  	if c.page >= len(c.minValues) {
   438  		c.minValues = append(c.minValues, value.Clone())
   439  	} else if c.columnType.Compare(c.minValues[c.page], value) > 0 {
   440  		c.minValues[c.page] = value.Clone()
   441  	}
   442  
   443  	if c.page >= len(c.maxValues) {
   444  		c.maxValues = append(c.maxValues, value.Clone())
   445  	} else if c.columnType.Compare(c.maxValues[c.page], value) < 0 {
   446  		c.maxValues[c.page] = value.Clone()
   447  	}
   448  }
   449  
   450  func (c *columnStats) pageRead() {
   451  	c.page++
   452  }
   453  
   454  type indexOrder int
   455  
   456  const (
   457  	invalidIndexOrder indexOrder = iota
   458  	unorderedIndexOrder
   459  	ascendingIndexOrder
   460  	descendingIndexOrder
   461  )
   462  
   463  func (o indexOrder) String() string {
   464  	switch o {
   465  	case unorderedIndexOrder:
   466  		return "unordered"
   467  	case ascendingIndexOrder:
   468  		return "ascending"
   469  	case descendingIndexOrder:
   470  		return "descending"
   471  	default:
   472  		return "invalid"
   473  	}
   474  }
   475  
   476  func columnIndexOrder(index parquet.ColumnIndex) indexOrder {
   477  	switch {
   478  	case index.IsAscending() && index.IsDescending():
   479  		return invalidIndexOrder
   480  	case index.IsAscending():
   481  		return ascendingIndexOrder
   482  	case index.IsDescending():
   483  		return descendingIndexOrder
   484  	default:
   485  		return unorderedIndexOrder
   486  	}
   487  }
   488  
   489  func observedIndexOrder(columnType parquet.Type, minValues []parquet.Value, maxValues []parquet.Value) indexOrder {
   490  	a := valueOrder(columnType, minValues)
   491  	b := valueOrder(columnType, maxValues)
   492  
   493  	switch {
   494  	case a == ascendingIndexOrder && b == ascendingIndexOrder:
   495  		return ascendingIndexOrder
   496  	case a == descendingIndexOrder && b == descendingIndexOrder:
   497  		return descendingIndexOrder
   498  	default:
   499  		return unorderedIndexOrder
   500  	}
   501  }
   502  
   503  func valueOrder(columnType parquet.Type, values []parquet.Value) indexOrder {
   504  	switch len(values) {
   505  	case 0, 1:
   506  		return unorderedIndexOrder
   507  	}
   508  
   509  	var order int
   510  	for i := 1; i < len(values); i++ {
   511  		next := columnType.Compare(values[i-1], values[i])
   512  		if next == 0 {
   513  			continue
   514  		}
   515  		if order == 0 {
   516  			order = next
   517  			continue
   518  		}
   519  		if order != next {
   520  			return unorderedIndexOrder
   521  		}
   522  	}
   523  
   524  	if order > 0 {
   525  		return descendingIndexOrder
   526  	}
   527  
   528  	return ascendingIndexOrder
   529  }
   530  
   531  func TestColumnPages_SeekToRow(t *testing.T) {
   532  	type Contact struct {
   533  		ID   int64  `parquet:"id"`
   534  		Name string `parquet:"name"`
   535  		Sex  bool   `parquet:"sex"`
   536  	}
   537  
   538  	buf := bytes.Buffer{}
   539  	writer := parquet.NewWriter(&buf)
   540  	data := [][]Contact{
   541  		{
   542  			{ID: 1, Name: "user1"},
   543  			{ID: 2, Name: "user2"},
   544  			{ID: 7, Name: "user7"},
   545  		},
   546  		{
   547  			{ID: 8, Name: "user8"},
   548  			{ID: 10, Name: "user10"},
   549  			{ID: 12, Name: "user12"},
   550  		},
   551  		{
   552  			{ID: 15, Name: "user15"},
   553  			{ID: 16, Name: "user16"},
   554  		},
   555  	}
   556  	for _, rows := range data {
   557  		for _, row := range rows {
   558  			err := writer.Write(&row)
   559  			if err != nil {
   560  				panic(err)
   561  			}
   562  		}
   563  		err := writer.Flush()
   564  		if err != nil {
   565  			panic(err)
   566  		}
   567  	}
   568  	err := writer.Close()
   569  	if err != nil {
   570  		panic(err)
   571  	}
   572  
   573  	pr, err := parquet.OpenFile(bytes.NewReader(buf.Bytes()), int64(buf.Len()))
   574  	if err != nil {
   575  		t.Error(err)
   576  	}
   577  
   578  	id := pr.Root().Column("id")
   579  
   580  	pages := id.Pages()
   581  	defer pages.Close()
   582  
   583  	var idx int64
   584  	for _, rows := range data {
   585  		for _, row := range rows {
   586  			err := pages.SeekToRow(idx)
   587  			if err != nil {
   588  				t.Error(err)
   589  			}
   590  
   591  			page, err := pages.ReadPage()
   592  			if err != nil {
   593  				t.Error(err)
   594  			}
   595  
   596  			var values [1]int64
   597  			page.Values().(interface {
   598  				ReadInt64s(values []int64) (n int, err error)
   599  			}).ReadInt64s(values[:])
   600  
   601  			if values[0] != row.ID {
   602  				t.Errorf("read value of page mismatch, row index %d: got=%d want=%d", idx, values[0], row.ID)
   603  			}
   604  
   605  			idx++
   606  		}
   607  	}
   608  }