github.com/parquet-go/parquet-go@v0.20.0/dictionary.go (about)

     1  package parquet
     2  
     3  import (
     4  	"io"
     5  	"math/bits"
     6  	"unsafe"
     7  
     8  	"github.com/parquet-go/parquet-go/deprecated"
     9  	"github.com/parquet-go/parquet-go/encoding"
    10  	"github.com/parquet-go/parquet-go/encoding/plain"
    11  	"github.com/parquet-go/parquet-go/hashprobe"
    12  	"github.com/parquet-go/parquet-go/internal/bitpack"
    13  	"github.com/parquet-go/parquet-go/internal/unsafecast"
    14  	"github.com/parquet-go/parquet-go/sparse"
    15  )
    16  
const (
	// Maximum load of probing tables. This parameter configures the balance
	// between memory density and compute time of probing operations. Valid
	// values are floating point numbers between 0 and 1.
	//
	// Smaller values result in lower collision probability when inserting
	// values in probing tables, but also increase memory utilization.
	//
	// TODO: make this configurable by the application?
	hashprobeTableMaxLoad = 0.85

	// An estimate of the CPU cache footprint used by insert operations,
	// expressed in bytes: the insert methods divide it by the size of one
	// value (4 or 8) to derive their chunk length.
	//
	// This constant is used to determine a useful chunk size depending on the
	// size of values being inserted in dictionaries. More values of small size
	// can fit in CPU caches, so the inserts can operate on larger chunks.
	insertsTargetCacheFootprint = 8192
)
    35  
    36  // The Dictionary interface represents type-specific implementations of parquet
    37  // dictionaries.
    38  //
    39  // Programs can instantiate dictionaries by call the NewDictionary method of a
    40  // Type object.
    41  //
    42  // The current implementation has a limitation which prevents applications from
    43  // providing custom versions of this interface because it contains unexported
    44  // methods. The only way to create Dictionary values is to call the
    45  // NewDictionary of Type instances. This limitation may be lifted in future
    46  // releases.
type Dictionary interface {
	// Returns the type that the dictionary was created from.
	Type() Type

	// Returns the number of values indexed in the dictionary.
	Len() int

	// Returns the dictionary value at the given index.
	Index(index int32) Value

	// Inserts values from the second slice to the dictionary and writes the
	// indexes at which each value was inserted to the first slice.
	//
	// The method panics if the length of the indexes slice is smaller than the
	// length of the values slice.
	Insert(indexes []int32, values []Value)

	// Given an array of dictionary indexes, looks up the corresponding values
	// and writes them into the array of values passed as second argument.
	//
	// The method panics if len(indexes) > len(values), or one of the indexes
	// is negative or greater than the highest index in the dictionary.
	Lookup(indexes []int32, values []Value)

	// Returns the min and max values found in the given indexes.
	Bounds(indexes []int32) (min, max Value)

	// Resets the dictionary to its initial state, removing all values.
	Reset()

	// Returns a Page representing the content of the dictionary.
	//
	// The returned page shares the underlying memory of the buffer; it remains
	// valid to use until the dictionary's Reset method is called.
	Page() Page

	// See ColumnBuffer.writeValues for details on the use of unexported methods
	// on interfaces.
	insert(indexes []int32, rows sparse.Array)
	//lookup(indexes []int32, rows sparse.Array)
}
    88  
    89  func checkLookupIndexBounds(indexes []int32, rows sparse.Array) {
    90  	if rows.Len() < len(indexes) {
    91  		panic("dictionary lookup with more indexes than values")
    92  	}
    93  }
    94  
// The boolean dictionary holds at most two distinct values, false and true.
// Once a value has been inserted, both are present (see insert).
type booleanDictionary struct {
	booleanPage
	// There are only two possible values for booleans, false and true.
	// Rather than using a Go map, we track the indexes of each value
	// in an array of two 32-bit integers. When inserting values in the
	// dictionary, we ensure that an index exists for each boolean value,
	// then use the value 0 or 1 (false or true) to perform a lookup in
	// this array. A negative entry means the value has no index yet.
	table [2]int32
}
   106  
   107  func newBooleanDictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *booleanDictionary {
   108  	indexOfFalse, indexOfTrue, values := int32(-1), int32(-1), data.Boolean()
   109  
   110  	for i := int32(0); i < numValues && indexOfFalse < 0 && indexOfTrue < 0; i += 8 {
   111  		v := values[i]
   112  		if v != 0x00 {
   113  			indexOfTrue = i + int32(bits.TrailingZeros8(v))
   114  		}
   115  		if v != 0xFF {
   116  			indexOfFalse = i + int32(bits.TrailingZeros8(^v))
   117  		}
   118  	}
   119  
   120  	return &booleanDictionary{
   121  		booleanPage: booleanPage{
   122  			typ:         typ,
   123  			bits:        values[:bitpack.ByteCount(uint(numValues))],
   124  			numValues:   numValues,
   125  			columnIndex: ^columnIndex,
   126  		},
   127  		table: [2]int32{
   128  			0: indexOfFalse,
   129  			1: indexOfTrue,
   130  		},
   131  	}
   132  }
   133  
   134  func (d *booleanDictionary) Type() Type { return newIndexedType(d.typ, d) }
   135  
   136  func (d *booleanDictionary) Len() int { return int(d.numValues) }
   137  
   138  func (d *booleanDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   139  
   140  func (d *booleanDictionary) index(i int32) bool { return d.valueAt(int(i)) }
   141  
   142  func (d *booleanDictionary) Insert(indexes []int32, values []Value) {
   143  	model := Value{}
   144  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   145  }
   146  
   147  func (d *booleanDictionary) insert(indexes []int32, rows sparse.Array) {
   148  	_ = indexes[:rows.Len()]
   149  
   150  	if d.table[0] < 0 {
   151  		d.table[0] = d.numValues
   152  		d.numValues++
   153  		d.bits = plain.AppendBoolean(d.bits, int(d.table[0]), false)
   154  	}
   155  
   156  	if d.table[1] < 0 {
   157  		d.table[1] = d.numValues
   158  		d.numValues++
   159  		d.bits = plain.AppendBoolean(d.bits, int(d.table[1]), true)
   160  	}
   161  
   162  	values := rows.Uint8Array()
   163  	dict := d.table
   164  
   165  	for i := 0; i < rows.Len(); i++ {
   166  		v := values.Index(i) & 1
   167  		indexes[i] = dict[v]
   168  	}
   169  }
   170  
   171  func (d *booleanDictionary) Lookup(indexes []int32, values []Value) {
   172  	model := d.makeValue(false)
   173  	memsetValues(values, model)
   174  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   175  }
   176  
   177  func (d *booleanDictionary) lookup(indexes []int32, rows sparse.Array) {
   178  	checkLookupIndexBounds(indexes, rows)
   179  	for i, j := range indexes {
   180  		*(*bool)(rows.Index(i)) = d.index(j)
   181  	}
   182  }
   183  
   184  func (d *booleanDictionary) Bounds(indexes []int32) (min, max Value) {
   185  	if len(indexes) > 0 {
   186  		hasFalse, hasTrue := false, false
   187  
   188  		for _, i := range indexes {
   189  			v := d.index(i)
   190  			if v {
   191  				hasTrue = true
   192  			} else {
   193  				hasFalse = true
   194  			}
   195  			if hasTrue && hasFalse {
   196  				break
   197  			}
   198  		}
   199  
   200  		min = d.makeValue(!hasFalse)
   201  		max = d.makeValue(hasTrue)
   202  	}
   203  	return min, max
   204  }
   205  
   206  func (d *booleanDictionary) Reset() {
   207  	d.bits = d.bits[:0]
   208  	d.offset = 0
   209  	d.numValues = 0
   210  	d.table = [2]int32{-1, -1}
   211  }
   212  
   213  func (d *booleanDictionary) Page() Page {
   214  	return &d.booleanPage
   215  }
   216  
// int32Dictionary is a dictionary of int32 values stored in an int32Page.
type int32Dictionary struct {
	int32Page
	// table maps values to their dictionary index; it is created on the
	// first insert and stays nil until then.
	table *hashprobe.Int32Table
}
   221  
   222  func newInt32Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *int32Dictionary {
   223  	return &int32Dictionary{
   224  		int32Page: int32Page{
   225  			typ:         typ,
   226  			values:      data.Int32()[:numValues],
   227  			columnIndex: ^columnIndex,
   228  		},
   229  	}
   230  }
   231  
   232  func (d *int32Dictionary) Type() Type { return newIndexedType(d.typ, d) }
   233  
   234  func (d *int32Dictionary) Len() int { return len(d.values) }
   235  
   236  func (d *int32Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   237  
   238  func (d *int32Dictionary) index(i int32) int32 { return d.values[i] }
   239  
   240  func (d *int32Dictionary) Insert(indexes []int32, values []Value) {
   241  	model := Value{}
   242  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   243  }
   244  
   245  func (d *int32Dictionary) init(indexes []int32) {
   246  	d.table = hashprobe.NewInt32Table(len(d.values), hashprobeTableMaxLoad)
   247  
   248  	n := min(len(d.values), len(indexes))
   249  
   250  	for i := 0; i < len(d.values); i += n {
   251  		j := min(i+n, len(d.values))
   252  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
   253  	}
   254  }
   255  
   256  func (d *int32Dictionary) insert(indexes []int32, rows sparse.Array) {
   257  	// Iterating over the input in chunks helps keep relevant data in CPU
   258  	// caches when a large number of values are inserted into the dictionary with
   259  	// a single method call.
   260  	//
   261  	// Without this chunking, memory areas from the head of the indexes and
   262  	// values arrays end up being evicted from CPU caches as the probing
   263  	// operation iterates through the array. The subsequent scan of the indexes
   264  	// required to determine which values must be inserted into the page then
   265  	// stalls on retrieving data from main memory.
   266  	//
   267  	// We measured as much as ~37% drop in throughput when disabling the
   268  	// chunking, and did not observe any penalties from having it on smaller
   269  	// inserts.
   270  	const chunkSize = insertsTargetCacheFootprint / 4
   271  
   272  	if d.table == nil {
   273  		d.init(indexes)
   274  	}
   275  
   276  	values := rows.Int32Array()
   277  
   278  	for i := 0; i < values.Len(); i += chunkSize {
   279  		j := min(i+chunkSize, values.Len())
   280  
   281  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
   282  			for k, index := range indexes[i:j] {
   283  				if index == int32(len(d.values)) {
   284  					d.values = append(d.values, values.Index(i+k))
   285  				}
   286  			}
   287  		}
   288  	}
   289  }
   290  
   291  func (d *int32Dictionary) Lookup(indexes []int32, values []Value) {
   292  	model := d.makeValue(0)
   293  	memsetValues(values, model)
   294  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   295  }
   296  
   297  func (d *int32Dictionary) Bounds(indexes []int32) (min, max Value) {
   298  	if len(indexes) > 0 {
   299  		minValue, maxValue := d.bounds(indexes)
   300  		min = d.makeValue(minValue)
   301  		max = d.makeValue(maxValue)
   302  	}
   303  	return min, max
   304  }
   305  
   306  func (d *int32Dictionary) Reset() {
   307  	d.values = d.values[:0]
   308  	if d.table != nil {
   309  		d.table.Reset()
   310  	}
   311  }
   312  
   313  func (d *int32Dictionary) Page() Page {
   314  	return &d.int32Page
   315  }
   316  
// int64Dictionary is a dictionary of int64 values stored in an int64Page.
type int64Dictionary struct {
	int64Page
	// table maps values to their dictionary index; it is created on the
	// first insert and stays nil until then.
	table *hashprobe.Int64Table
}
   321  
   322  func newInt64Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *int64Dictionary {
   323  	return &int64Dictionary{
   324  		int64Page: int64Page{
   325  			typ:         typ,
   326  			values:      data.Int64()[:numValues],
   327  			columnIndex: ^columnIndex,
   328  		},
   329  	}
   330  }
   331  
   332  func (d *int64Dictionary) Type() Type { return newIndexedType(d.typ, d) }
   333  
   334  func (d *int64Dictionary) Len() int { return len(d.values) }
   335  
   336  func (d *int64Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   337  
   338  func (d *int64Dictionary) index(i int32) int64 { return d.values[i] }
   339  
   340  func (d *int64Dictionary) Insert(indexes []int32, values []Value) {
   341  	model := Value{}
   342  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   343  }
   344  
   345  func (d *int64Dictionary) init(indexes []int32) {
   346  	d.table = hashprobe.NewInt64Table(len(d.values), hashprobeTableMaxLoad)
   347  
   348  	n := min(len(d.values), len(indexes))
   349  
   350  	for i := 0; i < len(d.values); i += n {
   351  		j := min(i+n, len(d.values))
   352  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
   353  	}
   354  }
   355  
   356  func (d *int64Dictionary) insert(indexes []int32, rows sparse.Array) {
   357  	const chunkSize = insertsTargetCacheFootprint / 8
   358  
   359  	if d.table == nil {
   360  		d.init(indexes)
   361  	}
   362  
   363  	values := rows.Int64Array()
   364  
   365  	for i := 0; i < values.Len(); i += chunkSize {
   366  		j := min(i+chunkSize, values.Len())
   367  
   368  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
   369  			for k, index := range indexes[i:j] {
   370  				if index == int32(len(d.values)) {
   371  					d.values = append(d.values, values.Index(i+k))
   372  				}
   373  			}
   374  		}
   375  	}
   376  }
   377  
   378  func (d *int64Dictionary) Lookup(indexes []int32, values []Value) {
   379  	model := d.makeValue(0)
   380  	memsetValues(values, model)
   381  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   382  }
   383  
   384  func (d *int64Dictionary) Bounds(indexes []int32) (min, max Value) {
   385  	if len(indexes) > 0 {
   386  		minValue, maxValue := d.bounds(indexes)
   387  		min = d.makeValue(minValue)
   388  		max = d.makeValue(maxValue)
   389  	}
   390  	return min, max
   391  }
   392  
   393  func (d *int64Dictionary) Reset() {
   394  	d.values = d.values[:0]
   395  	if d.table != nil {
   396  		d.table.Reset()
   397  	}
   398  }
   399  
   400  func (d *int64Dictionary) Page() Page {
   401  	return &d.int64Page
   402  }
   403  
// int96Dictionary is a dictionary of deprecated.Int96 values stored in an
// int96Page.
type int96Dictionary struct {
	int96Page
	// hashmap maps values to their dictionary index; it is built lazily on
	// the first insert and dropped by Reset.
	hashmap map[deprecated.Int96]int32
}
   408  
   409  func newInt96Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *int96Dictionary {
   410  	return &int96Dictionary{
   411  		int96Page: int96Page{
   412  			typ:         typ,
   413  			values:      data.Int96()[:numValues],
   414  			columnIndex: ^columnIndex,
   415  		},
   416  	}
   417  }
   418  
   419  func (d *int96Dictionary) Type() Type { return newIndexedType(d.typ, d) }
   420  
   421  func (d *int96Dictionary) Len() int { return len(d.values) }
   422  
   423  func (d *int96Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   424  
   425  func (d *int96Dictionary) index(i int32) deprecated.Int96 { return d.values[i] }
   426  
   427  func (d *int96Dictionary) Insert(indexes []int32, values []Value) {
   428  	d.insertValues(indexes, len(values), func(i int) deprecated.Int96 {
   429  		return values[i].Int96()
   430  	})
   431  }
   432  
   433  func (d *int96Dictionary) insert(indexes []int32, rows sparse.Array) {
   434  	d.insertValues(indexes, rows.Len(), func(i int) deprecated.Int96 {
   435  		return *(*deprecated.Int96)(rows.Index(i))
   436  	})
   437  }
   438  
   439  func (d *int96Dictionary) insertValues(indexes []int32, count int, valueAt func(int) deprecated.Int96) {
   440  	_ = indexes[:count]
   441  
   442  	if d.hashmap == nil {
   443  		d.hashmap = make(map[deprecated.Int96]int32, len(d.values))
   444  		for i, v := range d.values {
   445  			d.hashmap[v] = int32(i)
   446  		}
   447  	}
   448  
   449  	for i := 0; i < count; i++ {
   450  		value := valueAt(i)
   451  
   452  		index, exists := d.hashmap[value]
   453  		if !exists {
   454  			index = int32(len(d.values))
   455  			d.values = append(d.values, value)
   456  			d.hashmap[value] = index
   457  		}
   458  
   459  		indexes[i] = index
   460  	}
   461  }
   462  
   463  func (d *int96Dictionary) Lookup(indexes []int32, values []Value) {
   464  	for i, j := range indexes {
   465  		values[i] = d.Index(j)
   466  	}
   467  }
   468  
   469  func (d *int96Dictionary) Bounds(indexes []int32) (min, max Value) {
   470  	if len(indexes) > 0 {
   471  		minValue := d.index(indexes[0])
   472  		maxValue := minValue
   473  
   474  		for _, i := range indexes[1:] {
   475  			value := d.index(i)
   476  			switch {
   477  			case value.Less(minValue):
   478  				minValue = value
   479  			case maxValue.Less(value):
   480  				maxValue = value
   481  			}
   482  		}
   483  
   484  		min = d.makeValue(minValue)
   485  		max = d.makeValue(maxValue)
   486  	}
   487  	return min, max
   488  }
   489  
   490  func (d *int96Dictionary) Reset() {
   491  	d.values = d.values[:0]
   492  	d.hashmap = nil
   493  }
   494  
   495  func (d *int96Dictionary) Page() Page {
   496  	return &d.int96Page
   497  }
   498  
// floatDictionary is a dictionary of float32 values stored in a floatPage.
type floatDictionary struct {
	floatPage
	// table maps values to their dictionary index; it is created on the
	// first insert and stays nil until then.
	table *hashprobe.Float32Table
}
   503  
   504  func newFloatDictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *floatDictionary {
   505  	return &floatDictionary{
   506  		floatPage: floatPage{
   507  			typ:         typ,
   508  			values:      data.Float()[:numValues],
   509  			columnIndex: ^columnIndex,
   510  		},
   511  	}
   512  }
   513  
   514  func (d *floatDictionary) Type() Type { return newIndexedType(d.typ, d) }
   515  
   516  func (d *floatDictionary) Len() int { return len(d.values) }
   517  
   518  func (d *floatDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   519  
   520  func (d *floatDictionary) index(i int32) float32 { return d.values[i] }
   521  
   522  func (d *floatDictionary) Insert(indexes []int32, values []Value) {
   523  	model := Value{}
   524  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   525  }
   526  
   527  func (d *floatDictionary) init(indexes []int32) {
   528  	d.table = hashprobe.NewFloat32Table(len(d.values), hashprobeTableMaxLoad)
   529  
   530  	n := min(len(d.values), len(indexes))
   531  
   532  	for i := 0; i < len(d.values); i += n {
   533  		j := min(i+n, len(d.values))
   534  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
   535  	}
   536  }
   537  
   538  func (d *floatDictionary) insert(indexes []int32, rows sparse.Array) {
   539  	const chunkSize = insertsTargetCacheFootprint / 4
   540  
   541  	if d.table == nil {
   542  		d.init(indexes)
   543  	}
   544  
   545  	values := rows.Float32Array()
   546  
   547  	for i := 0; i < values.Len(); i += chunkSize {
   548  		j := min(i+chunkSize, values.Len())
   549  
   550  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
   551  			for k, index := range indexes[i:j] {
   552  				if index == int32(len(d.values)) {
   553  					d.values = append(d.values, values.Index(i+k))
   554  				}
   555  			}
   556  		}
   557  	}
   558  }
   559  
   560  func (d *floatDictionary) Lookup(indexes []int32, values []Value) {
   561  	model := d.makeValue(0)
   562  	memsetValues(values, model)
   563  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   564  }
   565  
   566  func (d *floatDictionary) Bounds(indexes []int32) (min, max Value) {
   567  	if len(indexes) > 0 {
   568  		minValue, maxValue := d.bounds(indexes)
   569  		min = d.makeValue(minValue)
   570  		max = d.makeValue(maxValue)
   571  	}
   572  	return min, max
   573  }
   574  
   575  func (d *floatDictionary) Reset() {
   576  	d.values = d.values[:0]
   577  	if d.table != nil {
   578  		d.table.Reset()
   579  	}
   580  }
   581  
   582  func (d *floatDictionary) Page() Page {
   583  	return &d.floatPage
   584  }
   585  
// doubleDictionary is a dictionary of float64 values stored in a doublePage.
type doubleDictionary struct {
	doublePage
	// table maps values to their dictionary index; it is created on the
	// first insert and stays nil until then.
	table *hashprobe.Float64Table
}
   590  
   591  func newDoubleDictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *doubleDictionary {
   592  	return &doubleDictionary{
   593  		doublePage: doublePage{
   594  			typ:         typ,
   595  			values:      data.Double()[:numValues],
   596  			columnIndex: ^columnIndex,
   597  		},
   598  	}
   599  }
   600  
   601  func (d *doubleDictionary) Type() Type { return newIndexedType(d.typ, d) }
   602  
   603  func (d *doubleDictionary) Len() int { return len(d.values) }
   604  
   605  func (d *doubleDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   606  
   607  func (d *doubleDictionary) index(i int32) float64 { return d.values[i] }
   608  
   609  func (d *doubleDictionary) Insert(indexes []int32, values []Value) {
   610  	model := Value{}
   611  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   612  }
   613  
   614  func (d *doubleDictionary) init(indexes []int32) {
   615  	d.table = hashprobe.NewFloat64Table(len(d.values), hashprobeTableMaxLoad)
   616  
   617  	n := min(len(d.values), len(indexes))
   618  
   619  	for i := 0; i < len(d.values); i += n {
   620  		j := min(i+n, len(d.values))
   621  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
   622  	}
   623  }
   624  
   625  func (d *doubleDictionary) insert(indexes []int32, rows sparse.Array) {
   626  	const chunkSize = insertsTargetCacheFootprint / 8
   627  
   628  	if d.table == nil {
   629  		d.init(indexes)
   630  	}
   631  
   632  	values := rows.Float64Array()
   633  
   634  	for i := 0; i < values.Len(); i += chunkSize {
   635  		j := min(i+chunkSize, values.Len())
   636  
   637  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
   638  			for k, index := range indexes[i:j] {
   639  				if index == int32(len(d.values)) {
   640  					d.values = append(d.values, values.Index(i+k))
   641  				}
   642  			}
   643  		}
   644  	}
   645  }
   646  
   647  func (d *doubleDictionary) Lookup(indexes []int32, values []Value) {
   648  	model := d.makeValue(0)
   649  	memsetValues(values, model)
   650  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   651  }
   652  
   653  func (d *doubleDictionary) Bounds(indexes []int32) (min, max Value) {
   654  	if len(indexes) > 0 {
   655  		minValue, maxValue := d.bounds(indexes)
   656  		min = d.makeValue(minValue)
   657  		max = d.makeValue(maxValue)
   658  	}
   659  	return min, max
   660  }
   661  
   662  func (d *doubleDictionary) Reset() {
   663  	d.values = d.values[:0]
   664  	if d.table != nil {
   665  		d.table.Reset()
   666  	}
   667  }
   668  
   669  func (d *doubleDictionary) Page() Page {
   670  	return &d.doublePage
   671  }
   672  
// byteArrayDictionary is a dictionary of variable-length byte arrays stored
// in a byteArrayPage.
type byteArrayDictionary struct {
	byteArrayPage
	// table maps values to their dictionary index; it is built lazily on the
	// first insert.
	table map[string]int32
	// alloc provides the copies of inserted values that are used as keys of
	// table; it is reset together with the dictionary.
	alloc allocator
}
   678  
   679  func newByteArrayDictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *byteArrayDictionary {
   680  	values, offsets := data.ByteArray()
   681  	// The first offset must always be zero, and the last offset is the length
   682  	// of the values in bytes.
   683  	//
   684  	// As an optimization we make the assumption that the backing array of the
   685  	// offsets slice belongs to the dictionary.
   686  	switch {
   687  	case cap(offsets) == 0:
   688  		offsets = make([]uint32, 1, 8)
   689  	case len(offsets) == 0:
   690  		offsets = append(offsets[:0], 0)
   691  	}
   692  	return &byteArrayDictionary{
   693  		byteArrayPage: byteArrayPage{
   694  			typ:         typ,
   695  			values:      values,
   696  			offsets:     offsets,
   697  			columnIndex: ^columnIndex,
   698  		},
   699  	}
   700  }
   701  
   702  func (d *byteArrayDictionary) Type() Type { return newIndexedType(d.typ, d) }
   703  
   704  func (d *byteArrayDictionary) Len() int { return d.len() }
   705  
   706  func (d *byteArrayDictionary) Index(i int32) Value { return d.makeValueBytes(d.index(int(i))) }
   707  
   708  func (d *byteArrayDictionary) Insert(indexes []int32, values []Value) {
   709  	model := Value{}
   710  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))
   711  }
   712  
   713  func (d *byteArrayDictionary) init() {
   714  	numValues := d.len()
   715  	d.table = make(map[string]int32, numValues)
   716  
   717  	for i := 0; i < numValues; i++ {
   718  		d.table[string(d.index(i))] = int32(len(d.table))
   719  	}
   720  }
   721  
   722  func (d *byteArrayDictionary) insert(indexes []int32, rows sparse.Array) {
   723  	if d.table == nil {
   724  		d.init()
   725  	}
   726  
   727  	values := rows.StringArray()
   728  
   729  	for i := range indexes {
   730  		value := values.Index(i)
   731  
   732  		index, exists := d.table[value]
   733  		if !exists {
   734  			value = d.alloc.copyString(value)
   735  			index = int32(len(d.table))
   736  			d.table[value] = index
   737  			d.values = append(d.values, value...)
   738  			d.offsets = append(d.offsets, uint32(len(d.values)))
   739  		}
   740  
   741  		indexes[i] = index
   742  	}
   743  }
   744  
   745  func (d *byteArrayDictionary) Lookup(indexes []int32, values []Value) {
   746  	model := d.makeValueString("")
   747  	memsetValues(values, model)
   748  	d.lookupString(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))
   749  }
   750  
   751  func (d *byteArrayDictionary) Bounds(indexes []int32) (min, max Value) {
   752  	if len(indexes) > 0 {
   753  		base := d.index(int(indexes[0]))
   754  		minValue := unsafecast.BytesToString(base)
   755  		maxValue := minValue
   756  		values := [64]string{}
   757  
   758  		for i := 1; i < len(indexes); i += len(values) {
   759  			n := len(indexes) - i
   760  			if n > len(values) {
   761  				n = len(values)
   762  			}
   763  			j := i + n
   764  			d.lookupString(indexes[i:j:j], makeArrayString(values[:n:n]))
   765  
   766  			for _, value := range values[:n:n] {
   767  				switch {
   768  				case value < minValue:
   769  					minValue = value
   770  				case value > maxValue:
   771  					maxValue = value
   772  				}
   773  			}
   774  		}
   775  
   776  		min = d.makeValueString(minValue)
   777  		max = d.makeValueString(maxValue)
   778  	}
   779  	return min, max
   780  }
   781  
   782  func (d *byteArrayDictionary) Reset() {
   783  	d.offsets = d.offsets[:1]
   784  	d.values = d.values[:0]
   785  	for k := range d.table {
   786  		delete(d.table, k)
   787  	}
   788  	d.alloc.reset()
   789  }
   790  
   791  func (d *byteArrayDictionary) Page() Page {
   792  	return &d.byteArrayPage
   793  }
   794  
// fixedLenByteArrayDictionary is a dictionary of fixed-size byte arrays
// stored back to back in a fixedLenByteArrayPage.
type fixedLenByteArrayDictionary struct {
	fixedLenByteArrayPage
	// hashmap maps values to their dictionary index; it is built lazily on
	// the first insert and dropped by Reset.
	hashmap map[string]int32
}
   799  
   800  func newFixedLenByteArrayDictionary(typ Type, columnIndex int16, numValues int32, values encoding.Values) *fixedLenByteArrayDictionary {
   801  	data, size := values.FixedLenByteArray()
   802  	return &fixedLenByteArrayDictionary{
   803  		fixedLenByteArrayPage: fixedLenByteArrayPage{
   804  			typ:         typ,
   805  			size:        size,
   806  			data:        data,
   807  			columnIndex: ^columnIndex,
   808  		},
   809  	}
   810  }
   811  
   812  func (d *fixedLenByteArrayDictionary) Type() Type { return newIndexedType(d.typ, d) }
   813  
   814  func (d *fixedLenByteArrayDictionary) Len() int { return len(d.data) / d.size }
   815  
   816  func (d *fixedLenByteArrayDictionary) Index(i int32) Value {
   817  	return d.makeValueBytes(d.index(i))
   818  }
   819  
   820  func (d *fixedLenByteArrayDictionary) index(i int32) []byte {
   821  	j := (int(i) + 0) * d.size
   822  	k := (int(i) + 1) * d.size
   823  	return d.data[j:k:k]
   824  }
   825  
   826  func (d *fixedLenByteArrayDictionary) Insert(indexes []int32, values []Value) {
   827  	d.insertValues(indexes, len(values), func(i int) *byte {
   828  		return values[i].ptr
   829  	})
   830  }
   831  
   832  func (d *fixedLenByteArrayDictionary) insert(indexes []int32, rows sparse.Array) {
   833  	d.insertValues(indexes, rows.Len(), func(i int) *byte {
   834  		return (*byte)(rows.Index(i))
   835  	})
   836  }
   837  
   838  func (d *fixedLenByteArrayDictionary) insertValues(indexes []int32, count int, valueAt func(int) *byte) {
   839  	_ = indexes[:count]
   840  
   841  	if d.hashmap == nil {
   842  		d.hashmap = make(map[string]int32, cap(d.data)/d.size)
   843  		for i, j := 0, int32(0); i < len(d.data); i += d.size {
   844  			d.hashmap[string(d.data[i:i+d.size])] = j
   845  			j++
   846  		}
   847  	}
   848  
   849  	for i := 0; i < count; i++ {
   850  		value := unsafe.Slice(valueAt(i), d.size)
   851  
   852  		index, exists := d.hashmap[string(value)]
   853  		if !exists {
   854  			index = int32(d.Len())
   855  			start := len(d.data)
   856  			d.data = append(d.data, value...)
   857  			d.hashmap[string(d.data[start:])] = index
   858  		}
   859  
   860  		indexes[i] = index
   861  	}
   862  }
   863  
   864  func (d *fixedLenByteArrayDictionary) Lookup(indexes []int32, values []Value) {
   865  	model := d.makeValueString("")
   866  	memsetValues(values, model)
   867  	d.lookupString(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))
   868  }
   869  
   870  func (d *fixedLenByteArrayDictionary) Bounds(indexes []int32) (min, max Value) {
   871  	if len(indexes) > 0 {
   872  		base := d.index(indexes[0])
   873  		minValue := unsafecast.BytesToString(base)
   874  		maxValue := minValue
   875  		values := [64]string{}
   876  
   877  		for i := 1; i < len(indexes); i += len(values) {
   878  			n := len(indexes) - i
   879  			if n > len(values) {
   880  				n = len(values)
   881  			}
   882  			j := i + n
   883  			d.lookupString(indexes[i:j:j], makeArrayString(values[:n:n]))
   884  
   885  			for _, value := range values[:n:n] {
   886  				switch {
   887  				case value < minValue:
   888  					minValue = value
   889  				case value > maxValue:
   890  					maxValue = value
   891  				}
   892  			}
   893  		}
   894  
   895  		min = d.makeValueString(minValue)
   896  		max = d.makeValueString(maxValue)
   897  	}
   898  	return min, max
   899  }
   900  
   901  func (d *fixedLenByteArrayDictionary) Reset() {
   902  	d.data = d.data[:0]
   903  	d.hashmap = nil
   904  }
   905  
   906  func (d *fixedLenByteArrayDictionary) Page() Page {
   907  	return &d.fixedLenByteArrayPage
   908  }
   909  
// uint32Dictionary is a dictionary of uint32 values stored in a uint32Page.
type uint32Dictionary struct {
	uint32Page
	// table maps values to their dictionary index; it is created on the
	// first insert and stays nil until then.
	table *hashprobe.Uint32Table
}
   914  
   915  func newUint32Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *uint32Dictionary {
   916  	return &uint32Dictionary{
   917  		uint32Page: uint32Page{
   918  			typ:         typ,
   919  			values:      data.Uint32()[:numValues],
   920  			columnIndex: ^columnIndex,
   921  		},
   922  	}
   923  }
   924  
   925  func (d *uint32Dictionary) Type() Type { return newIndexedType(d.typ, d) }
   926  
   927  func (d *uint32Dictionary) Len() int { return len(d.values) }
   928  
   929  func (d *uint32Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   930  
   931  func (d *uint32Dictionary) index(i int32) uint32 { return d.values[i] }
   932  
   933  func (d *uint32Dictionary) Insert(indexes []int32, values []Value) {
   934  	model := Value{}
   935  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   936  }
   937  
   938  func (d *uint32Dictionary) init(indexes []int32) {
   939  	d.table = hashprobe.NewUint32Table(len(d.values), hashprobeTableMaxLoad)
   940  
   941  	n := min(len(d.values), len(indexes))
   942  
   943  	for i := 0; i < len(d.values); i += n {
   944  		j := min(i+n, len(d.values))
   945  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
   946  	}
   947  }
   948  
   949  func (d *uint32Dictionary) insert(indexes []int32, rows sparse.Array) {
   950  	const chunkSize = insertsTargetCacheFootprint / 4
   951  
   952  	if d.table == nil {
   953  		d.init(indexes)
   954  	}
   955  
   956  	values := rows.Uint32Array()
   957  
   958  	for i := 0; i < values.Len(); i += chunkSize {
   959  		j := min(i+chunkSize, values.Len())
   960  
   961  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
   962  			for k, index := range indexes[i:j] {
   963  				if index == int32(len(d.values)) {
   964  					d.values = append(d.values, values.Index(i+k))
   965  				}
   966  			}
   967  		}
   968  	}
   969  }
   970  
   971  func (d *uint32Dictionary) Lookup(indexes []int32, values []Value) {
   972  	model := d.makeValue(0)
   973  	memsetValues(values, model)
   974  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   975  }
   976  
   977  func (d *uint32Dictionary) Bounds(indexes []int32) (min, max Value) {
   978  	if len(indexes) > 0 {
   979  		minValue, maxValue := d.bounds(indexes)
   980  		min = d.makeValue(minValue)
   981  		max = d.makeValue(maxValue)
   982  	}
   983  	return min, max
   984  }
   985  
   986  func (d *uint32Dictionary) Reset() {
   987  	d.values = d.values[:0]
   988  	if d.table != nil {
   989  		d.table.Reset()
   990  	}
   991  }
   992  
   993  func (d *uint32Dictionary) Page() Page {
   994  	return &d.uint32Page
   995  }
   996  
   997  type uint64Dictionary struct {
   998  	uint64Page
   999  	table *hashprobe.Uint64Table
  1000  }
  1001  
  1002  func newUint64Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *uint64Dictionary {
  1003  	return &uint64Dictionary{
  1004  		uint64Page: uint64Page{
  1005  			typ:         typ,
  1006  			values:      data.Uint64()[:numValues],
  1007  			columnIndex: ^columnIndex,
  1008  		},
  1009  	}
  1010  }
  1011  
  1012  func (d *uint64Dictionary) Type() Type { return newIndexedType(d.typ, d) }
  1013  
  1014  func (d *uint64Dictionary) Len() int { return len(d.values) }
  1015  
  1016  func (d *uint64Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
  1017  
  1018  func (d *uint64Dictionary) index(i int32) uint64 { return d.values[i] }
  1019  
  1020  func (d *uint64Dictionary) Insert(indexes []int32, values []Value) {
  1021  	model := Value{}
  1022  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
  1023  }
  1024  
  1025  func (d *uint64Dictionary) init(indexes []int32) {
  1026  	d.table = hashprobe.NewUint64Table(len(d.values), hashprobeTableMaxLoad)
  1027  
  1028  	n := min(len(d.values), len(indexes))
  1029  
  1030  	for i := 0; i < len(d.values); i += n {
  1031  		j := min(i+n, len(d.values))
  1032  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
  1033  	}
  1034  }
  1035  
  1036  func (d *uint64Dictionary) insert(indexes []int32, rows sparse.Array) {
  1037  	const chunkSize = insertsTargetCacheFootprint / 8
  1038  
  1039  	if d.table == nil {
  1040  		d.init(indexes)
  1041  	}
  1042  
  1043  	values := rows.Uint64Array()
  1044  
  1045  	for i := 0; i < values.Len(); i += chunkSize {
  1046  		j := min(i+chunkSize, values.Len())
  1047  
  1048  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
  1049  			for k, index := range indexes[i:j] {
  1050  				if index == int32(len(d.values)) {
  1051  					d.values = append(d.values, values.Index(i+k))
  1052  				}
  1053  			}
  1054  		}
  1055  	}
  1056  }
  1057  
  1058  func (d *uint64Dictionary) Lookup(indexes []int32, values []Value) {
  1059  	model := d.makeValue(0)
  1060  	memsetValues(values, model)
  1061  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
  1062  }
  1063  
  1064  func (d *uint64Dictionary) Bounds(indexes []int32) (min, max Value) {
  1065  	if len(indexes) > 0 {
  1066  		minValue, maxValue := d.bounds(indexes)
  1067  		min = d.makeValue(minValue)
  1068  		max = d.makeValue(maxValue)
  1069  	}
  1070  	return min, max
  1071  }
  1072  
  1073  func (d *uint64Dictionary) Reset() {
  1074  	d.values = d.values[:0]
  1075  	if d.table != nil {
  1076  		d.table.Reset()
  1077  	}
  1078  }
  1079  
  1080  func (d *uint64Dictionary) Page() Page {
  1081  	return &d.uint64Page
  1082  }
  1083  
  1084  type be128Dictionary struct {
  1085  	be128Page
  1086  	table *hashprobe.Uint128Table
  1087  }
  1088  
  1089  func newBE128Dictionary(typ Type, columnIndex int16, numValues int32, data encoding.Values) *be128Dictionary {
  1090  	return &be128Dictionary{
  1091  		be128Page: be128Page{
  1092  			typ:         typ,
  1093  			values:      data.Uint128()[:numValues],
  1094  			columnIndex: ^columnIndex,
  1095  		},
  1096  	}
  1097  }
  1098  
  1099  func (d *be128Dictionary) Type() Type { return newIndexedType(d.typ, d) }
  1100  
  1101  func (d *be128Dictionary) Len() int { return len(d.values) }
  1102  
  1103  func (d *be128Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
  1104  
  1105  func (d *be128Dictionary) index(i int32) *[16]byte { return &d.values[i] }
  1106  
  1107  func (d *be128Dictionary) Insert(indexes []int32, values []Value) {
  1108  	_ = indexes[:len(values)]
  1109  
  1110  	for _, v := range values {
  1111  		if v.kind != ^int8(FixedLenByteArray) {
  1112  			panic("values inserted in BE128 dictionary must be of type BYTE_ARRAY")
  1113  		}
  1114  		if v.u64 != 16 {
  1115  			panic("values inserted in BE128 dictionary must be of length 16")
  1116  		}
  1117  	}
  1118  
  1119  	if d.table == nil {
  1120  		d.init(indexes)
  1121  	}
  1122  
  1123  	const chunkSize = insertsTargetCacheFootprint / 16
  1124  	var buffer [chunkSize][16]byte
  1125  
  1126  	for i := 0; i < len(values); i += chunkSize {
  1127  		j := min(chunkSize+i, len(values))
  1128  		n := min(chunkSize, len(values)-i)
  1129  
  1130  		probe := buffer[:n:n]
  1131  		writePointersBE128(probe, makeArrayValue(values[i:j], unsafe.Offsetof(values[i].ptr)))
  1132  
  1133  		if d.table.Probe(probe, indexes[i:j:j]) > 0 {
  1134  			for k, v := range probe {
  1135  				if indexes[i+k] == int32(len(d.values)) {
  1136  					d.values = append(d.values, v)
  1137  				}
  1138  			}
  1139  		}
  1140  	}
  1141  }
  1142  
  1143  func (d *be128Dictionary) init(indexes []int32) {
  1144  	d.table = hashprobe.NewUint128Table(len(d.values), 0.75)
  1145  
  1146  	n := min(len(d.values), len(indexes))
  1147  
  1148  	for i := 0; i < len(d.values); i += n {
  1149  		j := min(i+n, len(d.values))
  1150  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
  1151  	}
  1152  }
  1153  
  1154  func (d *be128Dictionary) insert(indexes []int32, rows sparse.Array) {
  1155  	const chunkSize = insertsTargetCacheFootprint / 16
  1156  
  1157  	if d.table == nil {
  1158  		d.init(indexes)
  1159  	}
  1160  
  1161  	values := rows.Uint128Array()
  1162  
  1163  	for i := 0; i < values.Len(); i += chunkSize {
  1164  		j := min(i+chunkSize, values.Len())
  1165  
  1166  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
  1167  			for k, index := range indexes[i:j] {
  1168  				if index == int32(len(d.values)) {
  1169  					d.values = append(d.values, values.Index(i+k))
  1170  				}
  1171  			}
  1172  		}
  1173  	}
  1174  }
  1175  
  1176  func (d *be128Dictionary) Lookup(indexes []int32, values []Value) {
  1177  	model := d.makeValueString("")
  1178  	memsetValues(values, model)
  1179  	d.lookupString(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))
  1180  }
  1181  
  1182  func (d *be128Dictionary) Bounds(indexes []int32) (min, max Value) {
  1183  	if len(indexes) > 0 {
  1184  		minValue, maxValue := d.bounds(indexes)
  1185  		min = d.makeValue(minValue)
  1186  		max = d.makeValue(maxValue)
  1187  	}
  1188  	return min, max
  1189  }
  1190  
  1191  func (d *be128Dictionary) Reset() {
  1192  	d.values = d.values[:0]
  1193  	if d.table != nil {
  1194  		d.table.Reset()
  1195  	}
  1196  }
  1197  
  1198  func (d *be128Dictionary) Page() Page {
  1199  	return &d.be128Page
  1200  }
  1201  
  1202  // indexedType is a wrapper around a Type value which overrides object
  1203  // constructors to use indexed versions referencing values in the dictionary
  1204  // instead of storing plain values.
  1205  type indexedType struct {
  1206  	Type
  1207  	dict Dictionary
  1208  }
  1209  
  1210  func newIndexedType(typ Type, dict Dictionary) *indexedType {
  1211  	return &indexedType{Type: typ, dict: dict}
  1212  }
  1213  
  1214  func (t *indexedType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {
  1215  	return newIndexedColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))
  1216  }
  1217  
  1218  func (t *indexedType) NewPage(columnIndex, numValues int, data encoding.Values) Page {
  1219  	return newIndexedPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
  1220  }
  1221  
  1222  // indexedPage is an implementation of the Page interface which stores
  1223  // indexes instead of plain value. The indexes reference the values in a
  1224  // dictionary that the page was created for.
  1225  type indexedPage struct {
  1226  	typ         *indexedType
  1227  	values      []int32
  1228  	columnIndex int16
  1229  }
  1230  
  1231  func newIndexedPage(typ *indexedType, columnIndex int16, numValues int32, data encoding.Values) *indexedPage {
  1232  	// RLE encoded values that contain dictionary indexes in data pages are
  1233  	// sometimes truncated when they contain only zeros. We account for this
  1234  	// special case here and extend the values buffer if it is shorter than
  1235  	// needed to hold `numValues`.
  1236  	size := int(numValues)
  1237  	values := data.Int32()
  1238  
  1239  	if len(values) < size {
  1240  		if cap(values) < size {
  1241  			tmp := make([]int32, size)
  1242  			copy(tmp, values)
  1243  			values = tmp
  1244  		} else {
  1245  			clear := values[len(values) : len(values)+size]
  1246  			for i := range clear {
  1247  				clear[i] = 0
  1248  			}
  1249  		}
  1250  	}
  1251  
  1252  	return &indexedPage{
  1253  		typ:         typ,
  1254  		values:      values[:size],
  1255  		columnIndex: ^columnIndex,
  1256  	}
  1257  }
  1258  
  1259  func (page *indexedPage) Type() Type { return indexedPageType{page.typ} }
  1260  
  1261  func (page *indexedPage) Column() int { return int(^page.columnIndex) }
  1262  
  1263  func (page *indexedPage) Dictionary() Dictionary { return page.typ.dict }
  1264  
  1265  func (page *indexedPage) NumRows() int64 { return int64(len(page.values)) }
  1266  
  1267  func (page *indexedPage) NumValues() int64 { return int64(len(page.values)) }
  1268  
  1269  func (page *indexedPage) NumNulls() int64 { return 0 }
  1270  
  1271  func (page *indexedPage) Size() int64 { return 4 * int64(len(page.values)) }
  1272  
  1273  func (page *indexedPage) RepetitionLevels() []byte { return nil }
  1274  
  1275  func (page *indexedPage) DefinitionLevels() []byte { return nil }
  1276  
  1277  func (page *indexedPage) Data() encoding.Values { return encoding.Int32Values(page.values) }
  1278  
  1279  func (page *indexedPage) Values() ValueReader { return &indexedPageValues{page: page} }
  1280  
  1281  func (page *indexedPage) Bounds() (min, max Value, ok bool) {
  1282  	if ok = len(page.values) > 0; ok {
  1283  		min, max = page.typ.dict.Bounds(page.values)
  1284  		min.columnIndex = page.columnIndex
  1285  		max.columnIndex = page.columnIndex
  1286  	}
  1287  	return min, max, ok
  1288  }
  1289  
  1290  func (page *indexedPage) Slice(i, j int64) Page {
  1291  	return &indexedPage{
  1292  		typ:         page.typ,
  1293  		values:      page.values[i:j],
  1294  		columnIndex: page.columnIndex,
  1295  	}
  1296  }
  1297  
  1298  // indexedPageType is an adapter for the indexedType returned when accessing
  1299  // the type of an indexedPage value. It overrides the Encode/Decode methods to
  1300  // account for the fact that an indexed page is holding indexes of values into
  1301  // its dictionary instead of plain values.
  1302  type indexedPageType struct{ *indexedType }
  1303  
  1304  func (t indexedPageType) NewValues(values []byte, _ []uint32) encoding.Values {
  1305  	return encoding.Int32ValuesFromBytes(values)
  1306  }
  1307  
  1308  func (t indexedPageType) Encode(dst []byte, src encoding.Values, enc encoding.Encoding) ([]byte, error) {
  1309  	return encoding.EncodeInt32(dst, src, enc)
  1310  }
  1311  
  1312  func (t indexedPageType) Decode(dst encoding.Values, src []byte, enc encoding.Encoding) (encoding.Values, error) {
  1313  	return encoding.DecodeInt32(dst, src, enc)
  1314  }
  1315  
  1316  func (t indexedPageType) EstimateDecodeSize(numValues int, src []byte, enc encoding.Encoding) int {
  1317  	return Int32Type.EstimateDecodeSize(numValues, src, enc)
  1318  }
  1319  
  1320  type indexedPageValues struct {
  1321  	page   *indexedPage
  1322  	offset int
  1323  }
  1324  
  1325  func (r *indexedPageValues) ReadValues(values []Value) (n int, err error) {
  1326  	if n = len(r.page.values) - r.offset; n == 0 {
  1327  		return 0, io.EOF
  1328  	}
  1329  	if n > len(values) {
  1330  		n = len(values)
  1331  	}
  1332  	r.page.typ.dict.Lookup(r.page.values[r.offset:r.offset+n], values[:n])
  1333  	r.offset += n
  1334  	if r.offset == len(r.page.values) {
  1335  		err = io.EOF
  1336  	}
  1337  	return n, err
  1338  }
  1339  
  1340  // indexedColumnBuffer is an implementation of the ColumnBuffer interface which
  1341  // builds a page of indexes into a parent dictionary when values are written.
  1342  type indexedColumnBuffer struct{ indexedPage }
  1343  
  1344  func newIndexedColumnBuffer(typ *indexedType, columnIndex int16, numValues int32) *indexedColumnBuffer {
  1345  	return &indexedColumnBuffer{
  1346  		indexedPage: indexedPage{
  1347  			typ:         typ,
  1348  			values:      make([]int32, 0, numValues),
  1349  			columnIndex: ^columnIndex,
  1350  		},
  1351  	}
  1352  }
  1353  
  1354  func (col *indexedColumnBuffer) Clone() ColumnBuffer {
  1355  	return &indexedColumnBuffer{
  1356  		indexedPage: indexedPage{
  1357  			typ:         col.typ,
  1358  			values:      append([]int32{}, col.values...),
  1359  			columnIndex: col.columnIndex,
  1360  		},
  1361  	}
  1362  }
  1363  
  1364  func (col *indexedColumnBuffer) Type() Type { return col.typ.Type }
  1365  
  1366  func (col *indexedColumnBuffer) ColumnIndex() (ColumnIndex, error) {
  1367  	return indexedColumnIndex{col}, nil
  1368  }
  1369  
  1370  func (col *indexedColumnBuffer) OffsetIndex() (OffsetIndex, error) {
  1371  	return indexedOffsetIndex{col}, nil
  1372  }
  1373  
  1374  func (col *indexedColumnBuffer) BloomFilter() BloomFilter { return nil }
  1375  
  1376  func (col *indexedColumnBuffer) Dictionary() Dictionary { return col.typ.dict }
  1377  
  1378  func (col *indexedColumnBuffer) Pages() Pages { return onePage(col.Page()) }
  1379  
  1380  func (col *indexedColumnBuffer) Page() Page { return &col.indexedPage }
  1381  
  1382  func (col *indexedColumnBuffer) Reset() { col.values = col.values[:0] }
  1383  
  1384  func (col *indexedColumnBuffer) Cap() int { return cap(col.values) }
  1385  
  1386  func (col *indexedColumnBuffer) Len() int { return len(col.values) }
  1387  
  1388  func (col *indexedColumnBuffer) Less(i, j int) bool {
  1389  	u := col.typ.dict.Index(col.values[i])
  1390  	v := col.typ.dict.Index(col.values[j])
  1391  	return col.typ.Compare(u, v) < 0
  1392  }
  1393  
  1394  func (col *indexedColumnBuffer) Swap(i, j int) {
  1395  	col.values[i], col.values[j] = col.values[j], col.values[i]
  1396  }
  1397  
  1398  func (col *indexedColumnBuffer) WriteValues(values []Value) (int, error) {
  1399  	i := len(col.values)
  1400  	j := len(col.values) + len(values)
  1401  
  1402  	if j <= cap(col.values) {
  1403  		col.values = col.values[:j]
  1404  	} else {
  1405  		tmp := make([]int32, j, 2*j)
  1406  		copy(tmp, col.values)
  1407  		col.values = tmp
  1408  	}
  1409  
  1410  	col.typ.dict.Insert(col.values[i:], values)
  1411  	return len(values), nil
  1412  }
  1413  
  1414  func (col *indexedColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
  1415  	i := len(col.values)
  1416  	j := len(col.values) + rows.Len()
  1417  
  1418  	if j <= cap(col.values) {
  1419  		col.values = col.values[:j]
  1420  	} else {
  1421  		tmp := make([]int32, j, 2*j)
  1422  		copy(tmp, col.values)
  1423  		col.values = tmp
  1424  	}
  1425  
  1426  	col.typ.dict.insert(col.values[i:], rows)
  1427  }
  1428  
  1429  func (col *indexedColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
  1430  	i := int(offset)
  1431  	switch {
  1432  	case i < 0:
  1433  		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
  1434  	case i >= len(col.values):
  1435  		return 0, io.EOF
  1436  	default:
  1437  		for n < len(values) && i < len(col.values) {
  1438  			values[n] = col.typ.dict.Index(col.values[i])
  1439  			values[n].columnIndex = col.columnIndex
  1440  			n++
  1441  			i++
  1442  		}
  1443  		if n < len(values) {
  1444  			err = io.EOF
  1445  		}
  1446  		return n, err
  1447  	}
  1448  }
  1449  
  1450  func (col *indexedColumnBuffer) ReadRowAt(row Row, index int64) (Row, error) {
  1451  	switch {
  1452  	case index < 0:
  1453  		return row, errRowIndexOutOfBounds(index, int64(len(col.values)))
  1454  	case index >= int64(len(col.values)):
  1455  		return row, io.EOF
  1456  	default:
  1457  		v := col.typ.dict.Index(col.values[index])
  1458  		v.columnIndex = col.columnIndex
  1459  		return append(row, v), nil
  1460  	}
  1461  }
  1462  
  1463  type indexedColumnIndex struct{ col *indexedColumnBuffer }
  1464  
  1465  func (index indexedColumnIndex) NumPages() int       { return 1 }
  1466  func (index indexedColumnIndex) NullCount(int) int64 { return 0 }
  1467  func (index indexedColumnIndex) NullPage(int) bool   { return false }
  1468  func (index indexedColumnIndex) MinValue(int) Value {
  1469  	min, _, _ := index.col.Bounds()
  1470  	return min
  1471  }
  1472  func (index indexedColumnIndex) MaxValue(int) Value {
  1473  	_, max, _ := index.col.Bounds()
  1474  	return max
  1475  }
  1476  func (index indexedColumnIndex) IsAscending() bool {
  1477  	min, max, _ := index.col.Bounds()
  1478  	return index.col.typ.Compare(min, max) <= 0
  1479  }
  1480  func (index indexedColumnIndex) IsDescending() bool {
  1481  	min, max, _ := index.col.Bounds()
  1482  	return index.col.typ.Compare(min, max) > 0
  1483  }
  1484  
  1485  type indexedOffsetIndex struct{ col *indexedColumnBuffer }
  1486  
  1487  func (index indexedOffsetIndex) NumPages() int                { return 1 }
  1488  func (index indexedOffsetIndex) Offset(int) int64             { return 0 }
  1489  func (index indexedOffsetIndex) CompressedPageSize(int) int64 { return index.col.Size() }
  1490  func (index indexedOffsetIndex) FirstRowIndex(int) int64      { return 0 }