github.com/segmentio/parquet-go@v0.0.0-20230712180008-5d42db8f0d47/column_index.go (about)

     1  package parquet
     2  
     3  import (
     4  	"github.com/segmentio/parquet-go/deprecated"
     5  	"github.com/segmentio/parquet-go/encoding/plain"
     6  	"github.com/segmentio/parquet-go/format"
     7  	"github.com/segmentio/parquet-go/internal/unsafecast"
     8  )
     9  
// ColumnIndex is the interface implemented by column indexes. It exposes,
// for each page of a column, its null statistics and min/max bounds, as well
// as the ordering of those bounds across pages.
type ColumnIndex interface {
	// NumPages returns the number of pages in the column index.
	NumPages() int

	// NullCount returns the number of null values in the page at the given
	// index.
	NullCount(int) int64

	// NullPage tells whether the page at the given index contains null values
	// only.
	NullPage(int) bool

	// MinValue and MaxValue return the min/max bounds for the page at the
	// given index in the column.
	MinValue(int) Value
	MaxValue(int) Value

	// IsAscending returns true if the column index min/max values are sorted
	// in ascending order (based on the ordering rules of the column's logical
	// type).
	IsAscending() bool

	// IsDescending returns true if the column index min/max values are sorted
	// in descending order (based on the ordering rules of the column's logical
	// type).
	IsDescending() bool
}
    35  
    36  // NewColumnIndex constructs a ColumnIndex instance from the given parquet
    37  // format column index. The kind argument configures the type of values
    38  func NewColumnIndex(kind Kind, index *format.ColumnIndex) ColumnIndex {
    39  	return &formatColumnIndex{
    40  		kind:  kind,
    41  		index: index,
    42  	}
    43  }
    44  
    45  type formatColumnIndex struct {
    46  	kind  Kind
    47  	index *format.ColumnIndex
    48  }
    49  
    50  func (f *formatColumnIndex) NumPages() int {
    51  	return len(f.index.MinValues)
    52  }
    53  
    54  func (f *formatColumnIndex) NullCount(i int) int64 {
    55  	if len(f.index.NullCounts) > 0 {
    56  		return f.index.NullCounts[i]
    57  	}
    58  	return 0
    59  }
    60  
    61  func (f *formatColumnIndex) NullPage(i int) bool {
    62  	return len(f.index.NullPages) > 0 && f.index.NullPages[i]
    63  }
    64  
    65  func (f *formatColumnIndex) MinValue(i int) Value {
    66  	if f.NullPage(i) {
    67  		return Value{}
    68  	}
    69  	return f.kind.Value(f.index.MinValues[i])
    70  }
    71  
    72  func (f *formatColumnIndex) MaxValue(i int) Value {
    73  	if f.NullPage(i) {
    74  		return Value{}
    75  	}
    76  	return f.kind.Value(f.index.MaxValues[i])
    77  }
    78  
    79  func (f *formatColumnIndex) IsAscending() bool {
    80  	return f.index.BoundaryOrder == format.Ascending
    81  }
    82  
    83  func (f *formatColumnIndex) IsDescending() bool {
    84  	return f.index.BoundaryOrder == format.Descending
    85  }
    86  
    87  type fileColumnIndex struct{ chunk *fileColumnChunk }
    88  
    89  func (i fileColumnIndex) NumPages() int {
    90  	return len(i.chunk.columnIndex.NullPages)
    91  }
    92  
    93  func (i fileColumnIndex) NullCount(j int) int64 {
    94  	if len(i.chunk.columnIndex.NullCounts) > 0 {
    95  		return i.chunk.columnIndex.NullCounts[j]
    96  	}
    97  	return 0
    98  }
    99  
   100  func (i fileColumnIndex) NullPage(j int) bool {
   101  	return len(i.chunk.columnIndex.NullPages) > 0 && i.chunk.columnIndex.NullPages[j]
   102  }
   103  
   104  func (i fileColumnIndex) MinValue(j int) Value {
   105  	if i.NullPage(j) {
   106  		return Value{}
   107  	}
   108  	return i.makeValue(i.chunk.columnIndex.MinValues[j])
   109  }
   110  
   111  func (i fileColumnIndex) MaxValue(j int) Value {
   112  	if i.NullPage(j) {
   113  		return Value{}
   114  	}
   115  	return i.makeValue(i.chunk.columnIndex.MaxValues[j])
   116  }
   117  
   118  func (i fileColumnIndex) IsAscending() bool {
   119  	return i.chunk.columnIndex.BoundaryOrder == format.Ascending
   120  }
   121  
   122  func (i fileColumnIndex) IsDescending() bool {
   123  	return i.chunk.columnIndex.BoundaryOrder == format.Descending
   124  }
   125  
   126  func (i *fileColumnIndex) makeValue(b []byte) Value {
   127  	return i.chunk.column.typ.Kind().Value(b)
   128  }
   129  
   130  type emptyColumnIndex struct{}
   131  
   132  func (emptyColumnIndex) NumPages() int       { return 0 }
   133  func (emptyColumnIndex) NullCount(int) int64 { return 0 }
   134  func (emptyColumnIndex) NullPage(int) bool   { return false }
   135  func (emptyColumnIndex) MinValue(int) Value  { return Value{} }
   136  func (emptyColumnIndex) MaxValue(int) Value  { return Value{} }
   137  func (emptyColumnIndex) IsAscending() bool   { return false }
   138  func (emptyColumnIndex) IsDescending() bool  { return false }
   139  
   140  type booleanColumnIndex struct{ page *booleanPage }
   141  
   142  func (i booleanColumnIndex) NumPages() int       { return 1 }
   143  func (i booleanColumnIndex) NullCount(int) int64 { return 0 }
   144  func (i booleanColumnIndex) NullPage(int) bool   { return false }
   145  func (i booleanColumnIndex) MinValue(int) Value  { return makeValueBoolean(i.page.min()) }
   146  func (i booleanColumnIndex) MaxValue(int) Value  { return makeValueBoolean(i.page.max()) }
   147  func (i booleanColumnIndex) IsAscending() bool   { return false }
   148  func (i booleanColumnIndex) IsDescending() bool  { return false }
   149  
   150  type int32ColumnIndex struct{ page *int32Page }
   151  
   152  func (i int32ColumnIndex) NumPages() int       { return 1 }
   153  func (i int32ColumnIndex) NullCount(int) int64 { return 0 }
   154  func (i int32ColumnIndex) NullPage(int) bool   { return false }
   155  func (i int32ColumnIndex) MinValue(int) Value  { return makeValueInt32(i.page.min()) }
   156  func (i int32ColumnIndex) MaxValue(int) Value  { return makeValueInt32(i.page.max()) }
   157  func (i int32ColumnIndex) IsAscending() bool   { return false }
   158  func (i int32ColumnIndex) IsDescending() bool  { return false }
   159  
   160  type int64ColumnIndex struct{ page *int64Page }
   161  
   162  func (i int64ColumnIndex) NumPages() int       { return 1 }
   163  func (i int64ColumnIndex) NullCount(int) int64 { return 0 }
   164  func (i int64ColumnIndex) NullPage(int) bool   { return false }
   165  func (i int64ColumnIndex) MinValue(int) Value  { return makeValueInt64(i.page.min()) }
   166  func (i int64ColumnIndex) MaxValue(int) Value  { return makeValueInt64(i.page.max()) }
   167  func (i int64ColumnIndex) IsAscending() bool   { return false }
   168  func (i int64ColumnIndex) IsDescending() bool  { return false }
   169  
   170  type int96ColumnIndex struct{ page *int96Page }
   171  
   172  func (i int96ColumnIndex) NumPages() int       { return 1 }
   173  func (i int96ColumnIndex) NullCount(int) int64 { return 0 }
   174  func (i int96ColumnIndex) NullPage(int) bool   { return false }
   175  func (i int96ColumnIndex) MinValue(int) Value  { return makeValueInt96(i.page.min()) }
   176  func (i int96ColumnIndex) MaxValue(int) Value  { return makeValueInt96(i.page.max()) }
   177  func (i int96ColumnIndex) IsAscending() bool   { return false }
   178  func (i int96ColumnIndex) IsDescending() bool  { return false }
   179  
   180  type floatColumnIndex struct{ page *floatPage }
   181  
   182  func (i floatColumnIndex) NumPages() int       { return 1 }
   183  func (i floatColumnIndex) NullCount(int) int64 { return 0 }
   184  func (i floatColumnIndex) NullPage(int) bool   { return false }
   185  func (i floatColumnIndex) MinValue(int) Value  { return makeValueFloat(i.page.min()) }
   186  func (i floatColumnIndex) MaxValue(int) Value  { return makeValueFloat(i.page.max()) }
   187  func (i floatColumnIndex) IsAscending() bool   { return false }
   188  func (i floatColumnIndex) IsDescending() bool  { return false }
   189  
   190  type doubleColumnIndex struct{ page *doublePage }
   191  
   192  func (i doubleColumnIndex) NumPages() int       { return 1 }
   193  func (i doubleColumnIndex) NullCount(int) int64 { return 0 }
   194  func (i doubleColumnIndex) NullPage(int) bool   { return false }
   195  func (i doubleColumnIndex) MinValue(int) Value  { return makeValueDouble(i.page.min()) }
   196  func (i doubleColumnIndex) MaxValue(int) Value  { return makeValueDouble(i.page.max()) }
   197  func (i doubleColumnIndex) IsAscending() bool   { return false }
   198  func (i doubleColumnIndex) IsDescending() bool  { return false }
   199  
   200  type byteArrayColumnIndex struct{ page *byteArrayPage }
   201  
   202  func (i byteArrayColumnIndex) NumPages() int       { return 1 }
   203  func (i byteArrayColumnIndex) NullCount(int) int64 { return 0 }
   204  func (i byteArrayColumnIndex) NullPage(int) bool   { return false }
   205  func (i byteArrayColumnIndex) MinValue(int) Value  { return makeValueBytes(ByteArray, i.page.min()) }
   206  func (i byteArrayColumnIndex) MaxValue(int) Value  { return makeValueBytes(ByteArray, i.page.max()) }
   207  func (i byteArrayColumnIndex) IsAscending() bool   { return false }
   208  func (i byteArrayColumnIndex) IsDescending() bool  { return false }
   209  
   210  type fixedLenByteArrayColumnIndex struct{ page *fixedLenByteArrayPage }
   211  
   212  func (i fixedLenByteArrayColumnIndex) NumPages() int       { return 1 }
   213  func (i fixedLenByteArrayColumnIndex) NullCount(int) int64 { return 0 }
   214  func (i fixedLenByteArrayColumnIndex) NullPage(int) bool   { return false }
   215  func (i fixedLenByteArrayColumnIndex) MinValue(int) Value {
   216  	return makeValueBytes(FixedLenByteArray, i.page.min())
   217  }
   218  func (i fixedLenByteArrayColumnIndex) MaxValue(int) Value {
   219  	return makeValueBytes(FixedLenByteArray, i.page.max())
   220  }
   221  func (i fixedLenByteArrayColumnIndex) IsAscending() bool  { return false }
   222  func (i fixedLenByteArrayColumnIndex) IsDescending() bool { return false }
   223  
   224  type uint32ColumnIndex struct{ page *uint32Page }
   225  
   226  func (i uint32ColumnIndex) NumPages() int       { return 1 }
   227  func (i uint32ColumnIndex) NullCount(int) int64 { return 0 }
   228  func (i uint32ColumnIndex) NullPage(int) bool   { return false }
   229  func (i uint32ColumnIndex) MinValue(int) Value  { return makeValueUint32(i.page.min()) }
   230  func (i uint32ColumnIndex) MaxValue(int) Value  { return makeValueUint32(i.page.max()) }
   231  func (i uint32ColumnIndex) IsAscending() bool   { return false }
   232  func (i uint32ColumnIndex) IsDescending() bool  { return false }
   233  
   234  type uint64ColumnIndex struct{ page *uint64Page }
   235  
   236  func (i uint64ColumnIndex) NumPages() int       { return 1 }
   237  func (i uint64ColumnIndex) NullCount(int) int64 { return 0 }
   238  func (i uint64ColumnIndex) NullPage(int) bool   { return false }
   239  func (i uint64ColumnIndex) MinValue(int) Value  { return makeValueUint64(i.page.min()) }
   240  func (i uint64ColumnIndex) MaxValue(int) Value  { return makeValueUint64(i.page.max()) }
   241  func (i uint64ColumnIndex) IsAscending() bool   { return false }
   242  func (i uint64ColumnIndex) IsDescending() bool  { return false }
   243  
   244  type be128ColumnIndex struct{ page *be128Page }
   245  
   246  func (i be128ColumnIndex) NumPages() int       { return 1 }
   247  func (i be128ColumnIndex) NullCount(int) int64 { return 0 }
   248  func (i be128ColumnIndex) NullPage(int) bool   { return false }
   249  func (i be128ColumnIndex) MinValue(int) Value  { return makeValueBytes(FixedLenByteArray, i.page.min()) }
   250  func (i be128ColumnIndex) MaxValue(int) Value  { return makeValueBytes(FixedLenByteArray, i.page.max()) }
   251  func (i be128ColumnIndex) IsAscending() bool   { return false }
   252  func (i be128ColumnIndex) IsDescending() bool  { return false }
   253  
// The ColumnIndexer interface is implemented by types that support generating
// parquet column indexes.
//
// The package does not export any types that implement this interface, programs
// must call NewColumnIndexer on a Type instance to construct column indexers.
type ColumnIndexer interface {
	// Reset resets the column indexer state.
	Reset()

	// IndexPage adds a page to the column indexer. numValues and numNulls are
	// the number of values and the number of null values in the page; min and
	// max are the bounds of the page.
	IndexPage(numValues, numNulls int64, min, max Value)

	// ColumnIndex generates a format.ColumnIndex value from the current state
	// of the column indexer.
	//
	// The returned value may reference internal buffers, in which case the
	// values remain valid until the next call to IndexPage or Reset on the
	// column indexer.
	ColumnIndex() format.ColumnIndex
}
   274  
   275  type baseColumnIndexer struct {
   276  	nullPages  []bool
   277  	nullCounts []int64
   278  }
   279  
   280  func (i *baseColumnIndexer) reset() {
   281  	i.nullPages = i.nullPages[:0]
   282  	i.nullCounts = i.nullCounts[:0]
   283  }
   284  
   285  func (i *baseColumnIndexer) observe(numValues, numNulls int64) {
   286  	i.nullPages = append(i.nullPages, numValues == numNulls)
   287  	i.nullCounts = append(i.nullCounts, numNulls)
   288  }
   289  
   290  func (i *baseColumnIndexer) columnIndex(minValues, maxValues [][]byte, minOrder, maxOrder int) format.ColumnIndex {
   291  	return format.ColumnIndex{
   292  		NullPages:     i.nullPages,
   293  		NullCounts:    i.nullCounts,
   294  		MinValues:     minValues,
   295  		MaxValues:     maxValues,
   296  		BoundaryOrder: boundaryOrderOf(minOrder, maxOrder),
   297  	}
   298  }
   299  
   300  type booleanColumnIndexer struct {
   301  	baseColumnIndexer
   302  	minValues []bool
   303  	maxValues []bool
   304  }
   305  
   306  func newBooleanColumnIndexer() *booleanColumnIndexer {
   307  	return new(booleanColumnIndexer)
   308  }
   309  
   310  func (i *booleanColumnIndexer) Reset() {
   311  	i.reset()
   312  	i.minValues = i.minValues[:0]
   313  	i.maxValues = i.maxValues[:0]
   314  }
   315  
   316  func (i *booleanColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   317  	i.observe(numValues, numNulls)
   318  	i.minValues = append(i.minValues, min.boolean())
   319  	i.maxValues = append(i.maxValues, max.boolean())
   320  }
   321  
   322  func (i *booleanColumnIndexer) ColumnIndex() format.ColumnIndex {
   323  	return i.columnIndex(
   324  		splitFixedLenByteArrays(unsafecast.BoolToBytes(i.minValues), 1),
   325  		splitFixedLenByteArrays(unsafecast.BoolToBytes(i.maxValues), 1),
   326  		orderOfBool(i.minValues),
   327  		orderOfBool(i.maxValues),
   328  	)
   329  }
   330  
   331  type int32ColumnIndexer struct {
   332  	baseColumnIndexer
   333  	minValues []int32
   334  	maxValues []int32
   335  }
   336  
   337  func newInt32ColumnIndexer() *int32ColumnIndexer {
   338  	return new(int32ColumnIndexer)
   339  }
   340  
   341  func (i *int32ColumnIndexer) Reset() {
   342  	i.reset()
   343  	i.minValues = i.minValues[:0]
   344  	i.maxValues = i.maxValues[:0]
   345  }
   346  
   347  func (i *int32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   348  	i.observe(numValues, numNulls)
   349  	i.minValues = append(i.minValues, min.int32())
   350  	i.maxValues = append(i.maxValues, max.int32())
   351  }
   352  
   353  func (i *int32ColumnIndexer) ColumnIndex() format.ColumnIndex {
   354  	return i.columnIndex(
   355  		splitFixedLenByteArrays(unsafecast.Int32ToBytes(i.minValues), 4),
   356  		splitFixedLenByteArrays(unsafecast.Int32ToBytes(i.maxValues), 4),
   357  		orderOfInt32(i.minValues),
   358  		orderOfInt32(i.maxValues),
   359  	)
   360  }
   361  
   362  type int64ColumnIndexer struct {
   363  	baseColumnIndexer
   364  	minValues []int64
   365  	maxValues []int64
   366  }
   367  
   368  func newInt64ColumnIndexer() *int64ColumnIndexer {
   369  	return new(int64ColumnIndexer)
   370  }
   371  
   372  func (i *int64ColumnIndexer) Reset() {
   373  	i.reset()
   374  	i.minValues = i.minValues[:0]
   375  	i.maxValues = i.maxValues[:0]
   376  }
   377  
   378  func (i *int64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   379  	i.observe(numValues, numNulls)
   380  	i.minValues = append(i.minValues, min.int64())
   381  	i.maxValues = append(i.maxValues, max.int64())
   382  }
   383  
   384  func (i *int64ColumnIndexer) ColumnIndex() format.ColumnIndex {
   385  	return i.columnIndex(
   386  		splitFixedLenByteArrays(unsafecast.Int64ToBytes(i.minValues), 8),
   387  		splitFixedLenByteArrays(unsafecast.Int64ToBytes(i.maxValues), 8),
   388  		orderOfInt64(i.minValues),
   389  		orderOfInt64(i.maxValues),
   390  	)
   391  }
   392  
   393  type int96ColumnIndexer struct {
   394  	baseColumnIndexer
   395  	minValues []deprecated.Int96
   396  	maxValues []deprecated.Int96
   397  }
   398  
   399  func newInt96ColumnIndexer() *int96ColumnIndexer {
   400  	return new(int96ColumnIndexer)
   401  }
   402  
   403  func (i *int96ColumnIndexer) Reset() {
   404  	i.reset()
   405  	i.minValues = i.minValues[:0]
   406  	i.maxValues = i.maxValues[:0]
   407  }
   408  
   409  func (i *int96ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   410  	i.observe(numValues, numNulls)
   411  	i.minValues = append(i.minValues, min.Int96())
   412  	i.maxValues = append(i.maxValues, max.Int96())
   413  }
   414  
   415  func (i *int96ColumnIndexer) ColumnIndex() format.ColumnIndex {
   416  	return i.columnIndex(
   417  		splitFixedLenByteArrays(deprecated.Int96ToBytes(i.minValues), 12),
   418  		splitFixedLenByteArrays(deprecated.Int96ToBytes(i.maxValues), 12),
   419  		deprecated.OrderOfInt96(i.minValues),
   420  		deprecated.OrderOfInt96(i.maxValues),
   421  	)
   422  }
   423  
   424  type floatColumnIndexer struct {
   425  	baseColumnIndexer
   426  	minValues []float32
   427  	maxValues []float32
   428  }
   429  
   430  func newFloatColumnIndexer() *floatColumnIndexer {
   431  	return new(floatColumnIndexer)
   432  }
   433  
   434  func (i *floatColumnIndexer) Reset() {
   435  	i.reset()
   436  	i.minValues = i.minValues[:0]
   437  	i.maxValues = i.maxValues[:0]
   438  }
   439  
   440  func (i *floatColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   441  	i.observe(numValues, numNulls)
   442  	i.minValues = append(i.minValues, min.float())
   443  	i.maxValues = append(i.maxValues, max.float())
   444  }
   445  
   446  func (i *floatColumnIndexer) ColumnIndex() format.ColumnIndex {
   447  	return i.columnIndex(
   448  		splitFixedLenByteArrays(unsafecast.Float32ToBytes(i.minValues), 4),
   449  		splitFixedLenByteArrays(unsafecast.Float32ToBytes(i.maxValues), 4),
   450  		orderOfFloat32(i.minValues),
   451  		orderOfFloat32(i.maxValues),
   452  	)
   453  }
   454  
   455  type doubleColumnIndexer struct {
   456  	baseColumnIndexer
   457  	minValues []float64
   458  	maxValues []float64
   459  }
   460  
   461  func newDoubleColumnIndexer() *doubleColumnIndexer {
   462  	return new(doubleColumnIndexer)
   463  }
   464  
   465  func (i *doubleColumnIndexer) Reset() {
   466  	i.reset()
   467  	i.minValues = i.minValues[:0]
   468  	i.maxValues = i.maxValues[:0]
   469  }
   470  
   471  func (i *doubleColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   472  	i.observe(numValues, numNulls)
   473  	i.minValues = append(i.minValues, min.double())
   474  	i.maxValues = append(i.maxValues, max.double())
   475  }
   476  
   477  func (i *doubleColumnIndexer) ColumnIndex() format.ColumnIndex {
   478  	return i.columnIndex(
   479  		splitFixedLenByteArrays(unsafecast.Float64ToBytes(i.minValues), 8),
   480  		splitFixedLenByteArrays(unsafecast.Float64ToBytes(i.maxValues), 8),
   481  		orderOfFloat64(i.minValues),
   482  		orderOfFloat64(i.maxValues),
   483  	)
   484  }
   485  
   486  type byteArrayColumnIndexer struct {
   487  	baseColumnIndexer
   488  	sizeLimit int
   489  	minValues []byte
   490  	maxValues []byte
   491  }
   492  
   493  func newByteArrayColumnIndexer(sizeLimit int) *byteArrayColumnIndexer {
   494  	return &byteArrayColumnIndexer{sizeLimit: sizeLimit}
   495  }
   496  
   497  func (i *byteArrayColumnIndexer) Reset() {
   498  	i.reset()
   499  	i.minValues = i.minValues[:0]
   500  	i.maxValues = i.maxValues[:0]
   501  }
   502  
   503  func (i *byteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   504  	i.observe(numValues, numNulls)
   505  	i.minValues = plain.AppendByteArray(i.minValues, min.byteArray())
   506  	i.maxValues = plain.AppendByteArray(i.maxValues, max.byteArray())
   507  }
   508  
   509  func (i *byteArrayColumnIndexer) ColumnIndex() format.ColumnIndex {
   510  	minValues := splitByteArrays(i.minValues)
   511  	maxValues := splitByteArrays(i.maxValues)
   512  	if sizeLimit := i.sizeLimit; sizeLimit > 0 {
   513  		for i, v := range minValues {
   514  			minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit)
   515  		}
   516  		for i, v := range maxValues {
   517  			maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit)
   518  		}
   519  	}
   520  	return i.columnIndex(
   521  		minValues,
   522  		maxValues,
   523  		orderOfBytes(minValues),
   524  		orderOfBytes(maxValues),
   525  	)
   526  }
   527  
   528  type fixedLenByteArrayColumnIndexer struct {
   529  	baseColumnIndexer
   530  	size      int
   531  	sizeLimit int
   532  	minValues []byte
   533  	maxValues []byte
   534  }
   535  
   536  func newFixedLenByteArrayColumnIndexer(size, sizeLimit int) *fixedLenByteArrayColumnIndexer {
   537  	return &fixedLenByteArrayColumnIndexer{
   538  		size:      size,
   539  		sizeLimit: sizeLimit,
   540  	}
   541  }
   542  
   543  func (i *fixedLenByteArrayColumnIndexer) Reset() {
   544  	i.reset()
   545  	i.minValues = i.minValues[:0]
   546  	i.maxValues = i.maxValues[:0]
   547  }
   548  
   549  func (i *fixedLenByteArrayColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   550  	i.observe(numValues, numNulls)
   551  	i.minValues = append(i.minValues, min.byteArray()...)
   552  	i.maxValues = append(i.maxValues, max.byteArray()...)
   553  }
   554  
   555  func (i *fixedLenByteArrayColumnIndexer) ColumnIndex() format.ColumnIndex {
   556  	minValues := splitFixedLenByteArrays(i.minValues, i.size)
   557  	maxValues := splitFixedLenByteArrays(i.maxValues, i.size)
   558  	if sizeLimit := i.sizeLimit; sizeLimit > 0 {
   559  		for i, v := range minValues {
   560  			minValues[i] = truncateLargeMinByteArrayValue(v, sizeLimit)
   561  		}
   562  		for i, v := range maxValues {
   563  			maxValues[i] = truncateLargeMaxByteArrayValue(v, sizeLimit)
   564  		}
   565  	}
   566  	return i.columnIndex(
   567  		minValues,
   568  		maxValues,
   569  		orderOfBytes(minValues),
   570  		orderOfBytes(maxValues),
   571  	)
   572  }
   573  
   574  type uint32ColumnIndexer struct {
   575  	baseColumnIndexer
   576  	minValues []uint32
   577  	maxValues []uint32
   578  }
   579  
   580  func newUint32ColumnIndexer() *uint32ColumnIndexer {
   581  	return new(uint32ColumnIndexer)
   582  }
   583  
   584  func (i *uint32ColumnIndexer) Reset() {
   585  	i.reset()
   586  	i.minValues = i.minValues[:0]
   587  	i.maxValues = i.maxValues[:0]
   588  }
   589  
   590  func (i *uint32ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   591  	i.observe(numValues, numNulls)
   592  	i.minValues = append(i.minValues, min.uint32())
   593  	i.maxValues = append(i.maxValues, max.uint32())
   594  }
   595  
   596  func (i *uint32ColumnIndexer) ColumnIndex() format.ColumnIndex {
   597  	return i.columnIndex(
   598  		splitFixedLenByteArrays(unsafecast.Uint32ToBytes(i.minValues), 4),
   599  		splitFixedLenByteArrays(unsafecast.Uint32ToBytes(i.maxValues), 4),
   600  		orderOfUint32(i.minValues),
   601  		orderOfUint32(i.maxValues),
   602  	)
   603  }
   604  
   605  type uint64ColumnIndexer struct {
   606  	baseColumnIndexer
   607  	minValues []uint64
   608  	maxValues []uint64
   609  }
   610  
   611  func newUint64ColumnIndexer() *uint64ColumnIndexer {
   612  	return new(uint64ColumnIndexer)
   613  }
   614  
   615  func (i *uint64ColumnIndexer) Reset() {
   616  	i.reset()
   617  	i.minValues = i.minValues[:0]
   618  	i.maxValues = i.maxValues[:0]
   619  }
   620  
   621  func (i *uint64ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   622  	i.observe(numValues, numNulls)
   623  	i.minValues = append(i.minValues, min.uint64())
   624  	i.maxValues = append(i.maxValues, max.uint64())
   625  }
   626  
   627  func (i *uint64ColumnIndexer) ColumnIndex() format.ColumnIndex {
   628  	return i.columnIndex(
   629  		splitFixedLenByteArrays(unsafecast.Uint64ToBytes(i.minValues), 8),
   630  		splitFixedLenByteArrays(unsafecast.Uint64ToBytes(i.maxValues), 8),
   631  		orderOfUint64(i.minValues),
   632  		orderOfUint64(i.maxValues),
   633  	)
   634  }
   635  
   636  type be128ColumnIndexer struct {
   637  	baseColumnIndexer
   638  	minValues [][16]byte
   639  	maxValues [][16]byte
   640  }
   641  
   642  func newBE128ColumnIndexer() *be128ColumnIndexer {
   643  	return new(be128ColumnIndexer)
   644  }
   645  
   646  func (i *be128ColumnIndexer) Reset() {
   647  	i.reset()
   648  	i.minValues = i.minValues[:0]
   649  	i.maxValues = i.maxValues[:0]
   650  }
   651  
   652  func (i *be128ColumnIndexer) IndexPage(numValues, numNulls int64, min, max Value) {
   653  	i.observe(numValues, numNulls)
   654  	if !min.IsNull() {
   655  		i.minValues = append(i.minValues, *(*[16]byte)(min.byteArray()))
   656  	}
   657  	if !max.IsNull() {
   658  		i.maxValues = append(i.maxValues, *(*[16]byte)(max.byteArray()))
   659  	}
   660  }
   661  
   662  func (i *be128ColumnIndexer) ColumnIndex() format.ColumnIndex {
   663  	minValues := splitFixedLenByteArrays(unsafecast.Uint128ToBytes(i.minValues), 16)
   664  	maxValues := splitFixedLenByteArrays(unsafecast.Uint128ToBytes(i.maxValues), 16)
   665  	return i.columnIndex(
   666  		minValues,
   667  		maxValues,
   668  		orderOfBytes(minValues),
   669  		orderOfBytes(maxValues),
   670  	)
   671  }
   672  
   673  func truncateLargeMinByteArrayValue(value []byte, sizeLimit int) []byte {
   674  	if len(value) > sizeLimit {
   675  		value = value[:sizeLimit]
   676  	}
   677  	return value
   678  }
   679  
   680  // truncateLargeMaxByteArrayValue truncates the given byte array to the given size limit.
   681  // If the given byte array is truncated, it is incremented by 1 in place.
   682  func truncateLargeMaxByteArrayValue(value []byte, sizeLimit int) []byte {
   683  	if len(value) > sizeLimit {
   684  		value = value[:sizeLimit]
   685  		incrementByteArrayInplace(value)
   686  	}
   687  	return value
   688  }
   689  
   690  // incrementByteArray increments the given byte array by 1.
   691  // Reference: https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryTruncator.java#L124
   692  func incrementByteArrayInplace(value []byte) {
   693  	for i := len(value) - 1; i >= 0; i-- {
   694  		value[i]++
   695  		if value[i] != 0 { // Did not overflow: 0xFF -> 0x00
   696  			return
   697  		}
   698  	}
   699  	// Fully overflowed, so restore all to 0xFF
   700  	for i := range value {
   701  		value[i] = 0xFF
   702  	}
   703  }
   704  
   705  func splitByteArrays(data []byte) [][]byte {
   706  	length := 0
   707  	plain.RangeByteArray(data, func([]byte) error {
   708  		length++
   709  		return nil
   710  	})
   711  	buffer := make([]byte, 0, len(data)-(4*length))
   712  	values := make([][]byte, 0, length)
   713  	plain.RangeByteArray(data, func(value []byte) error {
   714  		offset := len(buffer)
   715  		buffer = append(buffer, value...)
   716  		values = append(values, buffer[offset:])
   717  		return nil
   718  	})
   719  	return values
   720  }
   721  
   722  func splitFixedLenByteArrays(data []byte, size int) [][]byte {
   723  	data = copyBytes(data)
   724  	values := make([][]byte, len(data)/size)
   725  	for i := range values {
   726  		j := (i + 0) * size
   727  		k := (i + 1) * size
   728  		values[i] = data[j:k:k]
   729  	}
   730  	return values
   731  }
   732  
   733  func boundaryOrderOf(minOrder, maxOrder int) format.BoundaryOrder {
   734  	if minOrder == maxOrder {
   735  		switch {
   736  		case minOrder > 0:
   737  			return format.Ascending
   738  		case minOrder < 0:
   739  			return format.Descending
   740  		}
   741  	}
   742  	return format.Unordered
   743  }