github.com/parquet-go/parquet-go@v0.21.1-0.20240501160520-b3c3a0c3ed6f/column_index.go (about)

     1  package parquet
     2  
     3  import (
     4  	"github.com/parquet-go/parquet-go/deprecated"
     5  	"github.com/parquet-go/parquet-go/encoding/plain"
     6  	"github.com/parquet-go/parquet-go/format"
     7  	"github.com/parquet-go/parquet-go/internal/unsafecast"
     8  )
     9  
// ColumnIndex is the interface implemented by column indexes, which expose
// per-page null information and min/max bounds for the pages of a column.
type ColumnIndex interface {
	// NumPages returns the number of pages in the column index.
	NumPages() int

	// NullCount returns the number of null values in the page at the given
	// index.
	NullCount(int) int64

	// NullPage tells whether the page at the given index contains null values
	// only.
	NullPage(int) bool

	// MinValue and MaxValue return the min/max bounds for the page at the
	// given index in the column.
	MinValue(int) Value
	MaxValue(int) Value

	// IsAscending returns true if the column index min/max values are sorted
	// in ascending order (based on the ordering rules of the column's logical
	// type).
	IsAscending() bool

	// IsDescending returns true if the column index min/max values are sorted
	// in descending order (based on the ordering rules of the column's logical
	// type).
	IsDescending() bool
}
    35  
    36  // NewColumnIndex constructs a ColumnIndex instance from the given parquet
    37  // format column index. The kind argument configures the type of values
    38  func NewColumnIndex(kind Kind, index *format.ColumnIndex) ColumnIndex {
    39  	return &formatColumnIndex{
    40  		kind:  kind,
    41  		index: index,
    42  	}
    43  }
    44  
    45  type formatColumnIndex struct {
    46  	kind  Kind
    47  	index *format.ColumnIndex
    48  }
    49  
    50  func (f *formatColumnIndex) NumPages() int {
    51  	return len(f.index.MinValues)
    52  }
    53  
    54  func (f *formatColumnIndex) NullCount(i int) int64 {
    55  	if len(f.index.NullCounts) > 0 {
    56  		return f.index.NullCounts[i]
    57  	}
    58  	return 0
    59  }
    60  
    61  func (f *formatColumnIndex) NullPage(i int) bool {
    62  	return len(f.index.NullPages) > 0 && f.index.NullPages[i]
    63  }
    64  
    65  func (f *formatColumnIndex) MinValue(i int) Value {
    66  	if f.NullPage(i) {
    67  		return Value{}
    68  	}
    69  	return f.kind.Value(f.index.MinValues[i])
    70  }
    71  
    72  func (f *formatColumnIndex) MaxValue(i int) Value {
    73  	if f.NullPage(i) {
    74  		return Value{}
    75  	}
    76  	return f.kind.Value(f.index.MaxValues[i])
    77  }
    78  
    79  func (f *formatColumnIndex) IsAscending() bool {
    80  	return f.index.BoundaryOrder == format.Ascending
    81  }
    82  
    83  func (f *formatColumnIndex) IsDescending() bool {
    84  	return f.index.BoundaryOrder == format.Descending
    85  }
    86  
    87  type fileColumnIndex struct{ chunk *fileColumnChunk }
    88  
    89  func (i fileColumnIndex) NumPages() int {
    90  	return len(i.chunk.columnIndex.NullPages)
    91  }
    92  
    93  func (i fileColumnIndex) NullCount(j int) int64 {
    94  	if len(i.chunk.columnIndex.NullCounts) > 0 {
    95  		return i.chunk.columnIndex.NullCounts[j]
    96  	}
    97  	return 0
    98  }
    99  
   100  func (i fileColumnIndex) NullPage(j int) bool {
   101  	return len(i.chunk.columnIndex.NullPages) > 0 && i.chunk.columnIndex.NullPages[j]
   102  }
   103  
   104  func (i fileColumnIndex) MinValue(j int) Value {
   105  	if i.NullPage(j) {
   106  		return Value{}
   107  	}
   108  	return i.makeValue(i.chunk.columnIndex.MinValues[j])
   109  }
   110  
   111  func (i fileColumnIndex) MaxValue(j int) Value {
   112  	if i.NullPage(j) {
   113  		return Value{}
   114  	}
   115  	return i.makeValue(i.chunk.columnIndex.MaxValues[j])
   116  }
   117  
   118  func (i fileColumnIndex) IsAscending() bool {
   119  	return i.chunk.columnIndex.BoundaryOrder == format.Ascending
   120  }
   121  
   122  func (i fileColumnIndex) IsDescending() bool {
   123  	return i.chunk.columnIndex.BoundaryOrder == format.Descending
   124  }
   125  
   126  func (i *fileColumnIndex) makeValue(b []byte) Value {
   127  	return i.chunk.column.typ.Kind().Value(b)
   128  }
   129  
   130  type emptyColumnIndex struct{}
   131  
   132  func (emptyColumnIndex) NumPages() int       { return 0 }
   133  func (emptyColumnIndex) NullCount(int) int64 { return 0 }
   134  func (emptyColumnIndex) NullPage(int) bool   { return false }
   135  func (emptyColumnIndex) MinValue(int) Value  { return Value{} }
   136  func (emptyColumnIndex) MaxValue(int) Value  { return Value{} }
   137  func (emptyColumnIndex) IsAscending() bool   { return false }
   138  func (emptyColumnIndex) IsDescending() bool  { return false }
   139  
   140  type booleanColumnIndex struct{ page *booleanPage }
   141  
   142  func (i booleanColumnIndex) NumPages() int       { return 1 }
   143  func (i booleanColumnIndex) NullCount(int) int64 { return 0 }
   144  func (i booleanColumnIndex) NullPage(int) bool   { return false }
   145  func (i booleanColumnIndex) MinValue(int) Value  { return makeValueBoolean(i.page.min()) }
   146  func (i booleanColumnIndex) MaxValue(int) Value  { return makeValueBoolean(i.page.max()) }
   147  func (i booleanColumnIndex) IsAscending() bool   { return false }
   148  func (i booleanColumnIndex) IsDescending() bool  { return false }
   149  
   150  type int32ColumnIndex struct{ page *int32Page }
   151  
   152  func (i int32ColumnIndex) NumPages() int       { return 1 }
   153  func (i int32ColumnIndex) NullCount(int) int64 { return 0 }
   154  func (i int32ColumnIndex) NullPage(int) bool   { return false }
   155  func (i int32ColumnIndex) MinValue(int) Value  { return makeValueInt32(i.page.min()) }
   156  func (i int32ColumnIndex) MaxValue(int) Value  { return makeValueInt32(i.page.max()) }
   157  func (i int32ColumnIndex) IsAscending() bool   { return false }
   158  func (i int32ColumnIndex) IsDescending() bool  { return false }
   159  
   160  type int64ColumnIndex struct{ page *int64Page }
   161  
   162  func (i int64ColumnIndex) NumPages() int       { return 1 }
   163  func (i int64ColumnIndex) NullCount(int) int64 { return 0 }
   164  func (i int64ColumnIndex) NullPage(int) bool   { return false }
   165  func (i int64ColumnIndex) MinValue(int) Value  { return makeValueInt64(i.page.min()) }
   166  func (i int64ColumnIndex) MaxValue(int) Value  { return makeValueInt64(i.page.max()) }
   167  func (i int64ColumnIndex) IsAscending() bool   { return false }
   168  func (i int64ColumnIndex) IsDescending() bool  { return false }
   169  
   170  type int96ColumnIndex struct{ page *int96Page }
   171  
   172  func (i int96ColumnIndex) NumPages() int       { return 1 }
   173  func (i int96ColumnIndex) NullCount(int) int64 { return 0 }
   174  func (i int96ColumnIndex) NullPage(int) bool   { return false }
   175  func (i int96ColumnIndex) MinValue(int) Value  { return makeValueInt96(i.page.min()) }
   176  func (i int96ColumnIndex) MaxValue(int) Value  { return makeValueInt96(i.page.max()) }
   177  func (i int96ColumnIndex) IsAscending() bool   { return false }
   178  func (i int96ColumnIndex) IsDescending() bool  { return false }
   179  
   180  type floatColumnIndex struct{ page *floatPage }
   181  
   182  func (i floatColumnIndex) NumPages() int       { return 1 }
   183  func (i floatColumnIndex) NullCount(int) int64 { return 0 }
   184  func (i floatColumnIndex) NullPage(int) bool   { return false }
   185  func (i floatColumnIndex) MinValue(int) Value  { return makeValueFloat(i.page.min()) }
   186  func (i floatColumnIndex) MaxValue(int) Value  { return makeValueFloat(i.page.max()) }
   187  func (i floatColumnIndex) IsAscending() bool   { return false }
   188  func (i floatColumnIndex) IsDescending() bool  { return false }
   189  
   190  type doubleColumnIndex struct{ page *doublePage }
   191  
   192  func (i doubleColumnIndex) NumPages() int       { return 1 }
   193  func (i doubleColumnIndex) NullCount(int) int64 { return 0 }
   194  func (i doubleColumnIndex) NullPage(int) bool   { return false }
   195  func (i doubleColumnIndex) MinValue(int) Value  { return makeValueDouble(i.page.min()) }
   196  func (i doubleColumnIndex) MaxValue(int) Value  { return makeValueDouble(i.page.max()) }
   197  func (i doubleColumnIndex) IsAscending() bool   { return false }
   198  func (i doubleColumnIndex) IsDescending() bool  { return false }
   199  
   200  type byteArrayColumnIndex struct{ page *byteArrayPage }
   201  
   202  func (i byteArrayColumnIndex) NumPages() int       { return 1 }
   203  func (i byteArrayColumnIndex) NullCount(int) int64 { return 0 }
   204  func (i byteArrayColumnIndex) NullPage(int) bool   { return false }
   205  func (i byteArrayColumnIndex) MinValue(int) Value  { return makeValueBytes(ByteArray, i.page.min()) }
   206  func (i byteArrayColumnIndex) MaxValue(int) Value  { return makeValueBytes(ByteArray, i.page.max()) }
   207  func (i byteArrayColumnIndex) IsAscending() bool   { return false }
   208  func (i byteArrayColumnIndex) IsDescending() bool  { return false }
   209  
   210  type fixedLenByteArrayColumnIndex struct{ page *fixedLenByteArrayPage }
   211  
   212  func (i fixedLenByteArrayColumnIndex) NumPages() int       { return 1 }
   213  func (i fixedLenByteArrayColumnIndex) NullCount(int) int64 { return 0 }
   214  func (i fixedLenByteArrayColumnIndex) NullPage(int) bool   { return false }
   215  func (i fixedLenByteArrayColumnIndex) MinValue(int) Value {
   216  	return makeValueBytes(FixedLenByteArray, i.page.min())
   217  }
   218  func (i fixedLenByteArrayColumnIndex) MaxValue(int) Value {
   219  	return makeValueBytes(FixedLenByteArray, i.page.max())
   220  }
   221  func (i fixedLenByteArrayColumnIndex) IsAscending() bool  { return false }
   222  func (i fixedLenByteArrayColumnIndex) IsDescending() bool { return false }
   223  
   224  type uint32ColumnIndex struct{ page *uint32Page }
   225  
   226  func (i uint32ColumnIndex) NumPages() int       { return 1 }
   227  func (i uint32ColumnIndex) NullCount(int) int64 { return 0 }
   228  func (i uint32ColumnIndex) NullPage(int) bool   { return false }
   229  func (i uint32ColumnIndex) MinValue(int) Value  { return makeValueUint32(i.page.min()) }
   230  func (i uint32ColumnIndex) MaxValue(int) Value  { return makeValueUint32(i.page.max()) }
   231  func (i uint32ColumnIndex) IsAscending() bool   { return false }
   232  func (i uint32ColumnIndex) IsDescending() bool  { return false }
   233  
   234  type uint64ColumnIndex struct{ page *uint64Page }
   235  
   236  func (i uint64ColumnIndex) NumPages() int       { return 1 }
   237  func (i uint64ColumnIndex) NullCount(int) int64 { return 0 }
   238  func (i uint64ColumnIndex) NullPage(int) bool   { return false }
   239  func (i uint64ColumnIndex) MinValue(int) Value  { return makeValueUint64(i.page.min()) }
   240  func (i uint64ColumnIndex) MaxValue(int) Value  { return makeValueUint64(i.page.max()) }
   241  func (i uint64ColumnIndex) IsAscending() bool   { return false }
   242  func (i uint64ColumnIndex) IsDescending() bool  { return false }
   243  
   244  type be128ColumnIndex struct{ page *be128Page }
   245  
   246  func (i be128ColumnIndex) NumPages() int       { return 1 }
   247  func (i be128ColumnIndex) NullCount(int) int64 { return 0 }
   248  func (i be128ColumnIndex) NullPage(int) bool   { return false }
   249  func (i be128ColumnIndex) MinValue(int) Value  { return makeValueBytes(FixedLenByteArray, i.page.min()) }
   250  func (i be128ColumnIndex) MaxValue(int) Value  { return makeValueBytes(FixedLenByteArray, i.page.max()) }
   251  func (i be128ColumnIndex) IsAscending() bool   { return false }
   252  func (i be128ColumnIndex) IsDescending() bool  { return false }
   253  
// The ColumnIndexer interface is implemented by types that support generating
// parquet column indexes.
//
// The package does not export any types that implement this interface, programs
// must call NewColumnIndexer on a Type instance to construct column indexers.
type ColumnIndexer interface {
	// Reset resets the column indexer state, discarding all pages indexed so
	// far.
	Reset()

	// IndexPage adds a page to the column indexer.
	//
	// numValues is the number of values in the page, nulls included, and
	// numNulls is the number of those values that are null. min and max are
	// the bounds of the page.
	IndexPage(numValues, numNulls int64, min, max Value)

	// ColumnIndex generates a format.ColumnIndex value from the current state
	// of the column indexer.
	//
	// The returned value may reference internal buffers, in which case the
	// values remain valid until the next call to IndexPage or Reset on the
	// column indexer.
	ColumnIndex() format.ColumnIndex
}
   274  
   275  type baseColumnIndexer struct {
   276  	nullPages  []bool
   277  	nullCounts []int64
   278  }
   279  
   280  func (i *baseColumnIndexer) reset() {
   281  	i.nullPages = i.nullPages[:0]
   282  	i.nullCounts = i.nullCounts[:0]
   283  }
   284  
   285  func (i *baseColumnIndexer) observe(numValues, numNulls int64) {
   286  	i.nullPages = append(i.nullPages, numValues == numNulls)
   287  	i.nullCounts = append(i.nullCounts, numNulls)
   288  }
   289  
   290  func (i *baseColumnIndexer) columnIndex(minValues, maxValues [][]byte, minOrder, maxOrder int) format.ColumnIndex {
   291  	nullPages := make([]bool, len(i.nullPages))
   292  	copy(nullPages, i.nullPages)
   293  	nullCounts := make([]int64, len(i.nullCounts))
   294  	copy(nullCounts, i.nullCounts)
   295  	return format.ColumnIndex{
   296  		NullPages:     nullPages,
   297  		NullCounts:    nullCounts,
   298  		MinValues:     minValues,
   299  		MaxValues:     maxValues,
   300  		BoundaryOrder: boundaryOrderOf(minOrder, maxOrder),
   301  	}
   302  }
   303  
   304  type booleanColumnIndexer struct {
   305  	baseColumnIndexer
   306  	minValues []bool
   307  	maxValues []bool
   308  }
   309  
   310  func newBooleanColumnIndexer() *booleanColumnIndexer {
   311  	return new(booleanColumnIndexer)
   312  }
   313  
   314  func (i *booleanColumnIndexer) Reset() {
   315  	i.reset()
   316  	i.minValues = i.minValues[:0]
   317  	i.maxValues = i.maxValues[:0]
   318  }
   319  
   320  func (i *booleanColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   321  	i.observe(numValues, numNulls)
   322  	i.minValues = append(i.minValues, min.boolean())
   323  	i.maxValues = append(i.maxValues, max.boolean())
   324  }
   325  
   326  func (i *booleanColumnIndexer) ColumnIndex() format.ColumnIndex {
   327  	return i.columnIndex(
   328  		splitFixedLenByteArrays(unsafecast.BoolToBytes(i.minValues), 1),
   329  		splitFixedLenByteArrays(unsafecast.BoolToBytes(i.maxValues), 1),
   330  		orderOfBool(i.minValues),
   331  		orderOfBool(i.maxValues),
   332  	)
   333  }
   334  
   335  type int32ColumnIndexer struct {
   336  	baseColumnIndexer
   337  	minValues []int32
   338  	maxValues []int32
   339  }
   340  
   341  func newInt32ColumnIndexer() *int32ColumnIndexer {
   342  	return new(int32ColumnIndexer)
   343  }
   344  
   345  func (i *int32ColumnIndexer) Reset() {
   346  	i.reset()
   347  	i.minValues = i.minValues[:0]
   348  	i.maxValues = i.maxValues[:0]
   349  }
   350  
   351  func (i *int32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   352  	i.observe(numValues, numNulls)
   353  	i.minValues = append(i.minValues, min.int32())
   354  	i.maxValues = append(i.maxValues, max.int32())
   355  }
   356  
   357  func (i *int32ColumnIndexer) ColumnIndex() format.ColumnIndex {
   358  	return i.columnIndex(
   359  		splitFixedLenByteArrays(unsafecast.Int32ToBytes(i.minValues), 4),
   360  		splitFixedLenByteArrays(unsafecast.Int32ToBytes(i.maxValues), 4),
   361  		orderOfInt32(i.minValues),
   362  		orderOfInt32(i.maxValues),
   363  	)
   364  }
   365  
   366  type int64ColumnIndexer struct {
   367  	baseColumnIndexer
   368  	minValues []int64
   369  	maxValues []int64
   370  }
   371  
   372  func newInt64ColumnIndexer() *int64ColumnIndexer {
   373  	return new(int64ColumnIndexer)
   374  }
   375  
   376  func (i *int64ColumnIndexer) Reset() {
   377  	i.reset()
   378  	i.minValues = i.minValues[:0]
   379  	i.maxValues = i.maxValues[:0]
   380  }
   381  
   382  func (i *int64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   383  	i.observe(numValues, numNulls)
   384  	i.minValues = append(i.minValues, min.int64())
   385  	i.maxValues = append(i.maxValues, max.int64())
   386  }
   387  
   388  func (i *int64ColumnIndexer) ColumnIndex() format.ColumnIndex {
   389  	return i.columnIndex(
   390  		splitFixedLenByteArrays(unsafecast.Int64ToBytes(i.minValues), 8),
   391  		splitFixedLenByteArrays(unsafecast.Int64ToBytes(i.maxValues), 8),
   392  		orderOfInt64(i.minValues),
   393  		orderOfInt64(i.maxValues),
   394  	)
   395  }
   396  
   397  type int96ColumnIndexer struct {
   398  	baseColumnIndexer
   399  	minValues []deprecated.Int96
   400  	maxValues []deprecated.Int96
   401  }
   402  
   403  func newInt96ColumnIndexer() *int96ColumnIndexer {
   404  	return new(int96ColumnIndexer)
   405  }
   406  
   407  func (i *int96ColumnIndexer) Reset() {
   408  	i.reset()
   409  	i.minValues = i.minValues[:0]
   410  	i.maxValues = i.maxValues[:0]
   411  }
   412  
   413  func (i *int96ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   414  	i.observe(numValues, numNulls)
   415  	i.minValues = append(i.minValues, min.Int96())
   416  	i.maxValues = append(i.maxValues, max.Int96())
   417  }
   418  
   419  func (i *int96ColumnIndexer) ColumnIndex() format.ColumnIndex {
   420  	return i.columnIndex(
   421  		splitFixedLenByteArrays(deprecated.Int96ToBytes(i.minValues), 12),
   422  		splitFixedLenByteArrays(deprecated.Int96ToBytes(i.maxValues), 12),
   423  		deprecated.OrderOfInt96(i.minValues),
   424  		deprecated.OrderOfInt96(i.maxValues),
   425  	)
   426  }
   427  
   428  type floatColumnIndexer struct {
   429  	baseColumnIndexer
   430  	minValues []float32
   431  	maxValues []float32
   432  }
   433  
   434  func newFloatColumnIndexer() *floatColumnIndexer {
   435  	return new(floatColumnIndexer)
   436  }
   437  
   438  func (i *floatColumnIndexer) Reset() {
   439  	i.reset()
   440  	i.minValues = i.minValues[:0]
   441  	i.maxValues = i.maxValues[:0]
   442  }
   443  
   444  func (i *floatColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   445  	i.observe(numValues, numNulls)
   446  	i.minValues = append(i.minValues, min.float())
   447  	i.maxValues = append(i.maxValues, max.float())
   448  }
   449  
   450  func (i *floatColumnIndexer) ColumnIndex() format.ColumnIndex {
   451  	return i.columnIndex(
   452  		splitFixedLenByteArrays(unsafecast.Float32ToBytes(i.minValues), 4),
   453  		splitFixedLenByteArrays(unsafecast.Float32ToBytes(i.maxValues), 4),
   454  		orderOfFloat32(i.minValues),
   455  		orderOfFloat32(i.maxValues),
   456  	)
   457  }
   458  
   459  type doubleColumnIndexer struct {
   460  	baseColumnIndexer
   461  	minValues []float64
   462  	maxValues []float64
   463  }
   464  
   465  func newDoubleColumnIndexer() *doubleColumnIndexer {
   466  	return new(doubleColumnIndexer)
   467  }
   468  
   469  func (i *doubleColumnIndexer) Reset() {
   470  	i.reset()
   471  	i.minValues = i.minValues[:0]
   472  	i.maxValues = i.maxValues[:0]
   473  }
   474  
   475  func (i *doubleColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   476  	i.observe(numValues, numNulls)
   477  	i.minValues = append(i.minValues, min.double())
   478  	i.maxValues = append(i.maxValues, max.double())
   479  }
   480  
   481  func (i *doubleColumnIndexer) ColumnIndex() format.ColumnIndex {
   482  	return i.columnIndex(
   483  		splitFixedLenByteArrays(unsafecast.Float64ToBytes(i.minValues), 8),
   484  		splitFixedLenByteArrays(unsafecast.Float64ToBytes(i.maxValues), 8),
   485  		orderOfFloat64(i.minValues),
   486  		orderOfFloat64(i.maxValues),
   487  	)
   488  }
   489  
   490  type byteArrayColumnIndexer struct {
   491  	baseColumnIndexer
   492  	sizeLimit int
   493  	minValues []byte
   494  	maxValues []byte
   495  }
   496  
   497  func newByteArrayColumnIndexer(sizeLimit int) *byteArrayColumnIndexer {
   498  	return &byteArrayColumnIndexer{sizeLimit: sizeLimit}
   499  }
   500  
   501  func (i *byteArrayColumnIndexer) Reset() {
   502  	i.reset()
   503  	i.minValues = i.minValues[:0]
   504  	i.maxValues = i.maxValues[:0]
   505  }
   506  
   507  func (i *byteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   508  	i.observe(numValues, numNulls)
   509  	i.minValues = plain.AppendByteArray(i.minValues, min.byteArray())
   510  	i.maxValues = plain.AppendByteArray(i.maxValues, max.byteArray())
   511  }
   512  
   513  func (i *byteArrayColumnIndexer) ColumnIndex() format.ColumnIndex {
   514  	minValues := splitByteArrays(i.minValues)
   515  	maxValues := splitByteArrays(i.maxValues)
   516  	if sizeLimit := i.sizeLimit; sizeLimit > 0 {
   517  		for i, v := range minValues {
   518  			minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit)
   519  		}
   520  		for i, v := range maxValues {
   521  			maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit)
   522  		}
   523  	}
   524  	return i.columnIndex(
   525  		minValues,
   526  		maxValues,
   527  		orderOfBytes(minValues),
   528  		orderOfBytes(maxValues),
   529  	)
   530  }
   531  
   532  type fixedLenByteArrayColumnIndexer struct {
   533  	baseColumnIndexer
   534  	size      int
   535  	sizeLimit int
   536  	minValues []byte
   537  	maxValues []byte
   538  }
   539  
   540  func newFixedLenByteArrayColumnIndexer(size, sizeLimit int) *fixedLenByteArrayColumnIndexer {
   541  	return &fixedLenByteArrayColumnIndexer{
   542  		size:      size,
   543  		sizeLimit: sizeLimit,
   544  	}
   545  }
   546  
   547  func (i *fixedLenByteArrayColumnIndexer) Reset() {
   548  	i.reset()
   549  	i.minValues = i.minValues[:0]
   550  	i.maxValues = i.maxValues[:0]
   551  }
   552  
   553  func (i *fixedLenByteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   554  	i.observe(numValues, numNulls)
   555  	i.minValues = append(i.minValues, min.byteArray()...)
   556  	i.maxValues = append(i.maxValues, max.byteArray()...)
   557  }
   558  
   559  func (i *fixedLenByteArrayColumnIndexer) ColumnIndex() format.ColumnIndex {
   560  	minValues := splitFixedLenByteArrays(i.minValues, i.size)
   561  	maxValues := splitFixedLenByteArrays(i.maxValues, i.size)
   562  	if sizeLimit := i.sizeLimit; sizeLimit > 0 {
   563  		for i, v := range minValues {
   564  			minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit)
   565  		}
   566  		for i, v := range maxValues {
   567  			maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit)
   568  		}
   569  	}
   570  	return i.columnIndex(
   571  		minValues,
   572  		maxValues,
   573  		orderOfBytes(minValues),
   574  		orderOfBytes(maxValues),
   575  	)
   576  }
   577  
   578  type uint32ColumnIndexer struct {
   579  	baseColumnIndexer
   580  	minValues []uint32
   581  	maxValues []uint32
   582  }
   583  
   584  func newUint32ColumnIndexer() *uint32ColumnIndexer {
   585  	return new(uint32ColumnIndexer)
   586  }
   587  
   588  func (i *uint32ColumnIndexer) Reset() {
   589  	i.reset()
   590  	i.minValues = i.minValues[:0]
   591  	i.maxValues = i.maxValues[:0]
   592  }
   593  
   594  func (i *uint32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   595  	i.observe(numValues, numNulls)
   596  	i.minValues = append(i.minValues, min.uint32())
   597  	i.maxValues = append(i.maxValues, max.uint32())
   598  }
   599  
   600  func (i *uint32ColumnIndexer) ColumnIndex() format.ColumnIndex {
   601  	return i.columnIndex(
   602  		splitFixedLenByteArrays(unsafecast.Uint32ToBytes(i.minValues), 4),
   603  		splitFixedLenByteArrays(unsafecast.Uint32ToBytes(i.maxValues), 4),
   604  		orderOfUint32(i.minValues),
   605  		orderOfUint32(i.maxValues),
   606  	)
   607  }
   608  
   609  type uint64ColumnIndexer struct {
   610  	baseColumnIndexer
   611  	minValues []uint64
   612  	maxValues []uint64
   613  }
   614  
   615  func newUint64ColumnIndexer() *uint64ColumnIndexer {
   616  	return new(uint64ColumnIndexer)
   617  }
   618  
   619  func (i *uint64ColumnIndexer) Reset() {
   620  	i.reset()
   621  	i.minValues = i.minValues[:0]
   622  	i.maxValues = i.maxValues[:0]
   623  }
   624  
   625  func (i *uint64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   626  	i.observe(numValues, numNulls)
   627  	i.minValues = append(i.minValues, min.uint64())
   628  	i.maxValues = append(i.maxValues, max.uint64())
   629  }
   630  
   631  func (i *uint64ColumnIndexer) ColumnIndex() format.ColumnIndex {
   632  	return i.columnIndex(
   633  		splitFixedLenByteArrays(unsafecast.Uint64ToBytes(i.minValues), 8),
   634  		splitFixedLenByteArrays(unsafecast.Uint64ToBytes(i.maxValues), 8),
   635  		orderOfUint64(i.minValues),
   636  		orderOfUint64(i.maxValues),
   637  	)
   638  }
   639  
   640  type be128ColumnIndexer struct {
   641  	baseColumnIndexer
   642  	minValues [][16]byte
   643  	maxValues [][16]byte
   644  }
   645  
   646  func newBE128ColumnIndexer() *be128ColumnIndexer {
   647  	return new(be128ColumnIndexer)
   648  }
   649  
   650  func (i *be128ColumnIndexer) Reset() {
   651  	i.reset()
   652  	i.minValues = i.minValues[:0]
   653  	i.maxValues = i.maxValues[:0]
   654  }
   655  
   656  func (i *be128ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   657  	i.observe(numValues, numNulls)
   658  	if !min.IsNull() {
   659  		i.minValues = append(i.minValues, *(*[16]byte)(min.byteArray()))
   660  	}
   661  	if !max.IsNull() {
   662  		i.maxValues = append(i.maxValues, *(*[16]byte)(max.byteArray()))
   663  	}
   664  }
   665  
   666  func (i *be128ColumnIndexer) ColumnIndex() format.ColumnIndex {
   667  	minValues := splitFixedLenByteArrays(unsafecast.Uint128ToBytes(i.minValues), 16)
   668  	maxValues := splitFixedLenByteArrays(unsafecast.Uint128ToBytes(i.maxValues), 16)
   669  	return i.columnIndex(
   670  		minValues,
   671  		maxValues,
   672  		orderOfBytes(minValues),
   673  		orderOfBytes(maxValues),
   674  	)
   675  }
   676  
   677  func truncateLargeMinByteArrayValue(value []byte, sizeLimit int) []byte {
   678  	if len(value) > sizeLimit {
   679  		value = value[:sizeLimit]
   680  	}
   681  	return value
   682  }
   683  
   684  // truncateLargeMaxByteArrayValue truncates the given byte array to the given size limit.
   685  // If the given byte array is truncated, it is incremented by 1 in place.
   686  func truncateLargeMaxByteArrayValue(value []byte, sizeLimit int) []byte {
   687  	if len(value) > sizeLimit {
   688  		value = value[:sizeLimit]
   689  		incrementByteArrayInplace(value)
   690  	}
   691  	return value
   692  }
   693  
   694  // incrementByteArray increments the given byte array by 1.
   695  // Reference: https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java#L124
   696  func incrementByteArrayInplace(value []byte) {
   697  	for i := len(value) - 1; i >= 0; i-- {
   698  		value[i]++
   699  		if value[i] != 0 { // Did not overflow: 0xFF -> 0x00
   700  			return
   701  		}
   702  	}
   703  	// Fully overflowed, so restore all to 0xFF
   704  	for i := range value {
   705  		value[i] = 0xFF
   706  	}
   707  }
   708  
   709  func splitByteArrays(data []byte) [][]byte {
   710  	length := 0
   711  	plain.RangeByteArray(data, func([]byte) error {
   712  		length++
   713  		return nil
   714  	})
   715  	buffer := make([]byte, 0, len(data)-(4*length))
   716  	values := make([][]byte, 0, length)
   717  	plain.RangeByteArray(data, func(value []byte) error {
   718  		offset := len(buffer)
   719  		buffer = append(buffer, value...)
   720  		values = append(values, buffer[offset:])
   721  		return nil
   722  	})
   723  	return values
   724  }
   725  
   726  func splitFixedLenByteArrays(data []byte, size int) [][]byte {
   727  	data = copyBytes(data)
   728  	values := make([][]byte, len(data)/size)
   729  	for i := range values {
   730  		j := (i + 0) * size
   731  		k := (i + 1) * size
   732  		values[i] = data[j:k:k]
   733  	}
   734  	return values
   735  }
   736  
   737  func boundaryOrderOf(minOrder, maxOrder int) format.BoundaryOrder {
   738  	if minOrder == maxOrder {
   739  		switch {
   740  		case minOrder > 0:
   741  			return format.Ascending
   742  		case minOrder < 0:
   743  			return format.Descending
   744  		}
   745  	}
   746  	return format.Unordered
   747  }