github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/dictionary.go

github.com/vc42/parquet-go@v0.0.0-20240320194221-1a9adb5f23f5/dictionary.go (about)

     1  package parquet
     2  
     3  import (
     4  	"io"
     5  	"math/bits"
     6  	"unsafe"
     7  
     8  	"github.com/vc42/parquet-go/deprecated"
     9  	"github.com/vc42/parquet-go/encoding"
    10  	"github.com/vc42/parquet-go/encoding/plain"
    11  	"github.com/vc42/parquet-go/hashprobe"
    12  	"github.com/vc42/parquet-go/internal/bitpack"
    13  	"github.com/vc42/parquet-go/internal/unsafecast"
    14  	"github.com/vc42/parquet-go/sparse"
    15  )
    16  
    17  const (
    18  	// Maximum load of probing tables. This parameter configures the balance
    19  	// between memory density and compute time of probing operations. Valid
    20  	// values are floating point numbers between 0 and 1.
    21  	//
    22  	// Smaller values result in lower collision probability when inserting
    23  	// values in probing tables, but also increase memory utilization.
    24  	//
    25  	// TODO: make this configurable by the application?
    26  	hashprobeTableMaxLoad = 0.85
    27  
    28  	// An estimate of the CPU cache footprint used by insert operations.
    29  	//
    30  	// This constant is used to determine a useful chunk size depending on the
    31  	// size of values being inserted in dictionaries. More values of small size
    32  	// can fit in CPU caches, so the inserts can operation on larger chunks.
    33  	insertsTargetCacheFootprint = 8192
    34  )
    35  
    36  // The Dictionary interface represents type-specific implementations of parquet
    37  // dictionaries.
    38  //
    39  // Programs can instantiate dictionaries by call the NewDictionary method of a
    40  // Type object.
    41  //
    42  // The current implementation has a limitation which prevents applications from
    43  // providing custom versions of this interface because it contains unexported
    44  // methods. The only way to create Dictionary values is to call the
    45  // NewDictionary of Type instances. This limitation may be lifted in future
    46  // releases.
    47  type Dictionary interface {
    48  	// Returns the type that the dictionary was created from.
    49  	Type() Type
    50  
    51  	// Returns the number of value indexed in the dictionary.
    52  	Len() int
    53  
    54  	// Returns the dictionary value at the given index.
    55  	Index(index int32) Value
    56  
    57  	// Inserts values from the second slice to the dictionary and writes the
    58  	// indexes at which each value was inserted to the first slice.
    59  	//
    60  	// The method panics if the length of the indexes slice is smaller than the
    61  	// length of the values slice.
    62  	Insert(indexes []int32, values []Value)
    63  
    64  	// Given an array of dictionary indexes, lookup the values into the array
    65  	// of values passed as second argument.
    66  	//
    67  	// The method panics if len(indexes) > len(values), or one of the indexes
    68  	// is negative or greater than the highest index in the dictionary.
    69  	Lookup(indexes []int32, values []Value)
    70  
    71  	// Returns the min and max values found in the given indexes.
    72  	Bounds(indexes []int32) (min, max Value)
    73  
    74  	// Resets the dictionary to its initial state, removing all values.
    75  	Reset()
    76  
    77  	// Returns a BufferedPage representing the content of the dictionary.
    78  	//
    79  	// The returned page shares the underlying memory of the buffer, it remains
    80  	// valid to use until the dictionary's Reset method is called.
    81  	Page() BufferedPage
    82  
    83  	// See ColumnBuffer.writeValues for details on the use of unexported methods
    84  	// on interfaces.
    85  	insert(indexes []int32, rows sparse.Array)
    86  	//lookup(indexes []int32, rows sparse.Array)
    87  }
    88  
    89  func checkLookupIndexBounds(indexes []int32, rows sparse.Array) {
    90  	if rows.Len() < len(indexes) {
    91  		panic("dictionary lookup with more indexes than values")
    92  	}
    93  }
    94  
    95  // The boolean dictionary always contains two values for true and false.
    96  type booleanDictionary struct {
    97  	booleanPage
    98  	// There are only two possible values for booleans, false and true.
    99  	// Rather than using a Go map, we track the indexes of each values
   100  	// in an array of two 32 bits integers. When inserting values in the
   101  	// dictionary, we ensure that an index exist for each boolean value,
   102  	// then use the value 0 or 1 (false or true) to perform a lookup in
   103  	// the dictionary's map.
   104  	table [2]int32
   105  }
   106  
   107  func newBooleanDictionary(typ Type, columnIndex int16, numValues int32, values []byte) *booleanDictionary {
   108  	indexOfFalse, indexOfTrue := int32(-1), int32(-1)
   109  
   110  	for i := int32(0); i < numValues && indexOfFalse < 0 && indexOfTrue < 0; i += 8 {
   111  		v := values[i]
   112  		if v != 0x00 {
   113  			indexOfTrue = i + int32(bits.TrailingZeros8(v))
   114  		}
   115  		if v != 0xFF {
   116  			indexOfFalse = i + int32(bits.TrailingZeros8(^v))
   117  		}
   118  	}
   119  
   120  	return &booleanDictionary{
   121  		booleanPage: booleanPage{
   122  			typ:         typ,
   123  			bits:        values[:bitpack.ByteCount(uint(numValues))],
   124  			numValues:   numValues,
   125  			columnIndex: ^columnIndex,
   126  		},
   127  		table: [2]int32{
   128  			0: indexOfFalse,
   129  			1: indexOfTrue,
   130  		},
   131  	}
   132  }
   133  
   134  func (d *booleanDictionary) Type() Type { return newIndexedType(d.typ, d) }
   135  
   136  func (d *booleanDictionary) Len() int { return int(d.numValues) }
   137  
   138  func (d *booleanDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   139  
   140  func (d *booleanDictionary) index(i int32) bool { return d.valueAt(int(i)) }
   141  
   142  func (d *booleanDictionary) Insert(indexes []int32, values []Value) {
   143  	model := Value{}
   144  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   145  }
   146  
   147  func (d *booleanDictionary) insert(indexes []int32, rows sparse.Array) {
   148  	_ = indexes[:rows.Len()]
   149  
   150  	if d.table[0] < 0 {
   151  		d.table[0] = d.numValues
   152  		d.numValues++
   153  		d.bits = plain.AppendBoolean(d.bits, int(d.table[0]), false)
   154  	}
   155  
   156  	if d.table[1] < 0 {
   157  		d.table[1] = d.numValues
   158  		d.numValues++
   159  		d.bits = plain.AppendBoolean(d.bits, int(d.table[1]), true)
   160  	}
   161  
   162  	dict := d.table
   163  
   164  	for i := 0; i < rows.Len(); i++ {
   165  		v := *(*byte)(rows.Index(i)) & 1
   166  		indexes[i] = dict[v]
   167  	}
   168  }
   169  
   170  func (d *booleanDictionary) Lookup(indexes []int32, values []Value) {
   171  	model := d.makeValue(false)
   172  	memsetValues(values, model)
   173  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   174  }
   175  
   176  func (d *booleanDictionary) lookup(indexes []int32, rows sparse.Array) {
   177  	checkLookupIndexBounds(indexes, rows)
   178  	for i, j := range indexes {
   179  		*(*bool)(rows.Index(i)) = d.index(j)
   180  	}
   181  }
   182  
   183  func (d *booleanDictionary) Bounds(indexes []int32) (min, max Value) {
   184  	if len(indexes) > 0 {
   185  		hasFalse, hasTrue := false, false
   186  
   187  		for _, i := range indexes {
   188  			v := d.index(i)
   189  			if v {
   190  				hasTrue = true
   191  			} else {
   192  				hasFalse = true
   193  			}
   194  			if hasTrue && hasFalse {
   195  				break
   196  			}
   197  		}
   198  
   199  		min = d.makeValue(!hasFalse)
   200  		max = d.makeValue(hasTrue)
   201  	}
   202  	return min, max
   203  }
   204  
   205  func (d *booleanDictionary) Reset() {
   206  	d.bits = d.bits[:0]
   207  	d.offset = 0
   208  	d.numValues = 0
   209  	d.table = [2]int32{-1, -1}
   210  }
   211  
   212  func (d *booleanDictionary) Page() BufferedPage {
   213  	return &d.booleanPage
   214  }
   215  
   216  type int32Dictionary struct {
   217  	int32Page
   218  	table *hashprobe.Int32Table
   219  }
   220  
   221  func newInt32Dictionary(typ Type, columnIndex int16, numValues int32, values []byte) *int32Dictionary {
   222  	return &int32Dictionary{
   223  		int32Page: int32Page{
   224  			typ:         typ,
   225  			values:      unsafecast.BytesToInt32(values)[:numValues],
   226  			columnIndex: ^columnIndex,
   227  		},
   228  	}
   229  }
   230  
   231  func (d *int32Dictionary) Type() Type { return newIndexedType(d.typ, d) }
   232  
   233  func (d *int32Dictionary) Len() int { return len(d.values) }
   234  
   235  func (d *int32Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   236  
   237  func (d *int32Dictionary) index(i int32) int32 { return d.values[i] }
   238  
   239  func (d *int32Dictionary) Insert(indexes []int32, values []Value) {
   240  	model := Value{}
   241  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   242  }
   243  
   244  func (d *int32Dictionary) init(indexes []int32) {
   245  	d.table = hashprobe.NewInt32Table(cap(d.values), hashprobeTableMaxLoad)
   246  
   247  	n := min(len(d.values), len(indexes))
   248  
   249  	for i := 0; i < len(d.values); i += n {
   250  		j := min(i+n, len(d.values))
   251  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
   252  	}
   253  }
   254  
   255  func (d *int32Dictionary) insert(indexes []int32, rows sparse.Array) {
   256  	// Iterating over the input in chunks helps keep relevant data in CPU
   257  	// caches when a large number of values are inserted into the dictionary with
   258  	// a single method call.
   259  	//
   260  	// Without this chunking, memory areas from the head of the indexes and
   261  	// values arrays end up being evicted from CPU caches as the probing
   262  	// operation iterates through the array. The subsequent scan of the indexes
   263  	// required to determine which values must be inserted into the page then
   264  	// stalls on retrieving data from main memory.
   265  	//
   266  	// We measured as much as ~37% drop in throughput when disabling the
   267  	// chunking, and did not observe any penalties from having it on smaller
   268  	// inserts.
   269  	const chunkSize = insertsTargetCacheFootprint / 4
   270  
   271  	if d.table == nil {
   272  		d.init(indexes)
   273  	}
   274  
   275  	values := rows.Int32Array()
   276  
   277  	for i := 0; i < values.Len(); i += chunkSize {
   278  		j := min(i+chunkSize, values.Len())
   279  
   280  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
   281  			for k, index := range indexes[i:j] {
   282  				if index == int32(len(d.values)) {
   283  					d.values = append(d.values, values.Index(i+k))
   284  				}
   285  			}
   286  		}
   287  	}
   288  }
   289  
   290  func (d *int32Dictionary) Lookup(indexes []int32, values []Value) {
   291  	model := d.makeValue(0)
   292  	memsetValues(values, model)
   293  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   294  }
   295  
   296  func (d *int32Dictionary) Bounds(indexes []int32) (min, max Value) {
   297  	if len(indexes) > 0 {
   298  		minValue, maxValue := d.bounds(indexes)
   299  		min = d.makeValue(minValue)
   300  		max = d.makeValue(maxValue)
   301  	}
   302  	return min, max
   303  }
   304  
   305  func (d *int32Dictionary) Reset() {
   306  	d.values = d.values[:0]
   307  	if d.table != nil {
   308  		d.table.Reset()
   309  	}
   310  }
   311  
   312  func (d *int32Dictionary) Page() BufferedPage {
   313  	return &d.int32Page
   314  }
   315  
   316  type int64Dictionary struct {
   317  	int64Page
   318  	table *hashprobe.Int64Table
   319  }
   320  
   321  func newInt64Dictionary(typ Type, columnIndex int16, numValues int32, values []byte) *int64Dictionary {
   322  	return &int64Dictionary{
   323  		int64Page: int64Page{
   324  			typ:         typ,
   325  			values:      unsafecast.BytesToInt64(values)[:numValues],
   326  			columnIndex: ^columnIndex,
   327  		},
   328  	}
   329  }
   330  
   331  func (d *int64Dictionary) Type() Type { return newIndexedType(d.typ, d) }
   332  
   333  func (d *int64Dictionary) Len() int { return len(d.values) }
   334  
   335  func (d *int64Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   336  
   337  func (d *int64Dictionary) index(i int32) int64 { return d.values[i] }
   338  
   339  func (d *int64Dictionary) Insert(indexes []int32, values []Value) {
   340  	model := Value{}
   341  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   342  }
   343  
   344  func (d *int64Dictionary) init(indexes []int32) {
   345  	d.table = hashprobe.NewInt64Table(cap(d.values), hashprobeTableMaxLoad)
   346  
   347  	n := min(len(d.values), len(indexes))
   348  
   349  	for i := 0; i < len(d.values); i += n {
   350  		j := min(i+n, len(d.values))
   351  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
   352  	}
   353  }
   354  
   355  func (d *int64Dictionary) insert(indexes []int32, rows sparse.Array) {
   356  	const chunkSize = insertsTargetCacheFootprint / 8
   357  
   358  	if d.table == nil {
   359  		d.init(indexes)
   360  	}
   361  
   362  	values := rows.Int64Array()
   363  
   364  	for i := 0; i < values.Len(); i += chunkSize {
   365  		j := min(i+chunkSize, values.Len())
   366  
   367  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
   368  			for k, index := range indexes[i:j] {
   369  				if index == int32(len(d.values)) {
   370  					d.values = append(d.values, values.Index(i+k))
   371  				}
   372  			}
   373  		}
   374  	}
   375  }
   376  
   377  func (d *int64Dictionary) Lookup(indexes []int32, values []Value) {
   378  	model := d.makeValue(0)
   379  	memsetValues(values, model)
   380  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   381  }
   382  
   383  func (d *int64Dictionary) Bounds(indexes []int32) (min, max Value) {
   384  	if len(indexes) > 0 {
   385  		minValue, maxValue := d.bounds(indexes)
   386  		min = d.makeValue(minValue)
   387  		max = d.makeValue(maxValue)
   388  	}
   389  	return min, max
   390  }
   391  
   392  func (d *int64Dictionary) Reset() {
   393  	d.values = d.values[:0]
   394  	if d.table != nil {
   395  		d.table.Reset()
   396  	}
   397  }
   398  
   399  func (d *int64Dictionary) Page() BufferedPage {
   400  	return &d.int64Page
   401  }
   402  
   403  type int96Dictionary struct {
   404  	int96Page
   405  	hashmap map[deprecated.Int96]int32
   406  }
   407  
   408  func newInt96Dictionary(typ Type, columnIndex int16, numValues int32, values []byte) *int96Dictionary {
   409  	return &int96Dictionary{
   410  		int96Page: int96Page{
   411  			typ:         typ,
   412  			values:      deprecated.BytesToInt96(values)[:numValues],
   413  			columnIndex: ^columnIndex,
   414  		},
   415  	}
   416  }
   417  
   418  func (d *int96Dictionary) Type() Type { return newIndexedType(d.typ, d) }
   419  
   420  func (d *int96Dictionary) Len() int { return len(d.values) }
   421  
   422  func (d *int96Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   423  
   424  func (d *int96Dictionary) index(i int32) deprecated.Int96 { return d.values[i] }
   425  
   426  func (d *int96Dictionary) Insert(indexes []int32, values []Value) {
   427  	d.insertValues(indexes, len(values), func(i int) deprecated.Int96 {
   428  		return values[i].Int96()
   429  	})
   430  }
   431  
   432  func (d *int96Dictionary) insert(indexes []int32, rows sparse.Array) {
   433  	d.insertValues(indexes, rows.Len(), func(i int) deprecated.Int96 {
   434  		return *(*deprecated.Int96)(rows.Index(i))
   435  	})
   436  }
   437  
   438  func (d *int96Dictionary) insertValues(indexes []int32, count int, valueAt func(int) deprecated.Int96) {
   439  	_ = indexes[:count]
   440  
   441  	if d.hashmap == nil {
   442  		d.hashmap = make(map[deprecated.Int96]int32, cap(d.values))
   443  		for i, v := range d.values {
   444  			d.hashmap[v] = int32(i)
   445  		}
   446  	}
   447  
   448  	for i := 0; i < count; i++ {
   449  		value := valueAt(i)
   450  
   451  		index, exists := d.hashmap[value]
   452  		if !exists {
   453  			index = int32(len(d.values))
   454  			d.values = append(d.values, value)
   455  			d.hashmap[value] = index
   456  		}
   457  
   458  		indexes[i] = index
   459  	}
   460  }
   461  
   462  func (d *int96Dictionary) Lookup(indexes []int32, values []Value) {
   463  	for i, j := range indexes {
   464  		values[i] = d.Index(j)
   465  	}
   466  }
   467  
   468  func (d *int96Dictionary) Bounds(indexes []int32) (min, max Value) {
   469  	if len(indexes) > 0 {
   470  		minValue := d.index(indexes[0])
   471  		maxValue := minValue
   472  
   473  		for _, i := range indexes[1:] {
   474  			value := d.index(i)
   475  			switch {
   476  			case value.Less(minValue):
   477  				minValue = value
   478  			case maxValue.Less(value):
   479  				maxValue = value
   480  			}
   481  		}
   482  
   483  		min = d.makeValue(minValue)
   484  		max = d.makeValue(maxValue)
   485  	}
   486  	return min, max
   487  }
   488  
   489  func (d *int96Dictionary) Reset() {
   490  	d.values = d.values[:0]
   491  	d.hashmap = nil
   492  }
   493  
   494  func (d *int96Dictionary) Page() BufferedPage {
   495  	return &d.int96Page
   496  }
   497  
   498  type floatDictionary struct {
   499  	floatPage
   500  	table *hashprobe.Float32Table
   501  }
   502  
   503  func newFloatDictionary(typ Type, columnIndex int16, numValues int32, values []byte) *floatDictionary {
   504  	return &floatDictionary{
   505  		floatPage: floatPage{
   506  			typ:         typ,
   507  			values:      unsafecast.BytesToFloat32(values)[:numValues],
   508  			columnIndex: ^columnIndex,
   509  		},
   510  	}
   511  }
   512  
   513  func (d *floatDictionary) Type() Type { return newIndexedType(d.typ, d) }
   514  
   515  func (d *floatDictionary) Len() int { return len(d.values) }
   516  
   517  func (d *floatDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   518  
   519  func (d *floatDictionary) index(i int32) float32 { return d.values[i] }
   520  
   521  func (d *floatDictionary) Insert(indexes []int32, values []Value) {
   522  	model := Value{}
   523  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   524  }
   525  
   526  func (d *floatDictionary) init(indexes []int32) {
   527  	d.table = hashprobe.NewFloat32Table(cap(d.values), hashprobeTableMaxLoad)
   528  
   529  	n := min(len(d.values), len(indexes))
   530  
   531  	for i := 0; i < len(d.values); i += n {
   532  		j := min(i+n, len(d.values))
   533  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
   534  	}
   535  }
   536  
   537  func (d *floatDictionary) insert(indexes []int32, rows sparse.Array) {
   538  	const chunkSize = insertsTargetCacheFootprint / 4
   539  
   540  	if d.table == nil {
   541  		d.init(indexes)
   542  	}
   543  
   544  	values := rows.Float32Array()
   545  
   546  	for i := 0; i < values.Len(); i += chunkSize {
   547  		j := min(i+chunkSize, values.Len())
   548  
   549  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
   550  			for k, index := range indexes[i:j] {
   551  				if index == int32(len(d.values)) {
   552  					d.values = append(d.values, values.Index(i+k))
   553  				}
   554  			}
   555  		}
   556  	}
   557  }
   558  
   559  func (d *floatDictionary) Lookup(indexes []int32, values []Value) {
   560  	model := d.makeValue(0)
   561  	memsetValues(values, model)
   562  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   563  }
   564  
   565  func (d *floatDictionary) Bounds(indexes []int32) (min, max Value) {
   566  	if len(indexes) > 0 {
   567  		minValue, maxValue := d.bounds(indexes)
   568  		min = d.makeValue(minValue)
   569  		max = d.makeValue(maxValue)
   570  	}
   571  	return min, max
   572  }
   573  
   574  func (d *floatDictionary) Reset() {
   575  	d.values = d.values[:0]
   576  	if d.table != nil {
   577  		d.table.Reset()
   578  	}
   579  }
   580  
   581  func (d *floatDictionary) Page() BufferedPage {
   582  	return &d.floatPage
   583  }
   584  
   585  type doubleDictionary struct {
   586  	doublePage
   587  	table *hashprobe.Float64Table
   588  }
   589  
   590  func newDoubleDictionary(typ Type, columnIndex int16, numValues int32, values []byte) *doubleDictionary {
   591  	return &doubleDictionary{
   592  		doublePage: doublePage{
   593  			typ:         typ,
   594  			values:      unsafecast.BytesToFloat64(values)[:numValues],
   595  			columnIndex: ^columnIndex,
   596  		},
   597  	}
   598  }
   599  
   600  func (d *doubleDictionary) Type() Type { return newIndexedType(d.typ, d) }
   601  
   602  func (d *doubleDictionary) Len() int { return len(d.values) }
   603  
   604  func (d *doubleDictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   605  
   606  func (d *doubleDictionary) index(i int32) float64 { return d.values[i] }
   607  
   608  func (d *doubleDictionary) Insert(indexes []int32, values []Value) {
   609  	model := Value{}
   610  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   611  }
   612  
   613  func (d *doubleDictionary) init(indexes []int32) {
   614  	d.table = hashprobe.NewFloat64Table(cap(d.values), hashprobeTableMaxLoad)
   615  
   616  	n := min(len(d.values), len(indexes))
   617  
   618  	for i := 0; i < len(d.values); i += n {
   619  		j := min(i+n, len(d.values))
   620  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
   621  	}
   622  }
   623  
   624  func (d *doubleDictionary) insert(indexes []int32, rows sparse.Array) {
   625  	const chunkSize = insertsTargetCacheFootprint / 8
   626  
   627  	if d.table == nil {
   628  		d.init(indexes)
   629  	}
   630  
   631  	values := rows.Float64Array()
   632  
   633  	for i := 0; i < values.Len(); i += chunkSize {
   634  		j := min(i+chunkSize, values.Len())
   635  
   636  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
   637  			for k, index := range indexes[i:j] {
   638  				if index == int32(len(d.values)) {
   639  					d.values = append(d.values, values.Index(i+k))
   640  				}
   641  			}
   642  		}
   643  	}
   644  }
   645  
   646  func (d *doubleDictionary) Lookup(indexes []int32, values []Value) {
   647  	model := d.makeValue(0)
   648  	memsetValues(values, model)
   649  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   650  }
   651  
   652  func (d *doubleDictionary) Bounds(indexes []int32) (min, max Value) {
   653  	if len(indexes) > 0 {
   654  		minValue, maxValue := d.bounds(indexes)
   655  		min = d.makeValue(minValue)
   656  		max = d.makeValue(maxValue)
   657  	}
   658  	return min, max
   659  }
   660  
   661  func (d *doubleDictionary) Reset() {
   662  	d.values = d.values[:0]
   663  	if d.table != nil {
   664  		d.table.Reset()
   665  	}
   666  }
   667  
   668  func (d *doubleDictionary) Page() BufferedPage {
   669  	return &d.doublePage
   670  }
   671  
   672  type byteArrayDictionary struct {
   673  	byteArrayPage
   674  	offsets []uint32
   675  	hashmap map[string]int32
   676  }
   677  
   678  func newByteArrayDictionary(typ Type, columnIndex int16, numValues int32, values []byte) *byteArrayDictionary {
   679  	d := &byteArrayDictionary{
   680  		offsets: make([]uint32, 0, numValues),
   681  		byteArrayPage: byteArrayPage{
   682  			typ:         typ,
   683  			values:      values,
   684  			numValues:   numValues,
   685  			columnIndex: ^columnIndex,
   686  		},
   687  	}
   688  
   689  	for i := 0; i < len(values); {
   690  		n := plain.ByteArrayLength(values[i:])
   691  		d.offsets = append(d.offsets, uint32(i))
   692  		i += plain.ByteArrayLengthSize
   693  		i += n
   694  	}
   695  
   696  	return d
   697  }
   698  
   699  func (d *byteArrayDictionary) Type() Type { return newIndexedType(d.typ, d) }
   700  
   701  func (d *byteArrayDictionary) Len() int { return len(d.offsets) }
   702  
   703  func (d *byteArrayDictionary) Index(i int32) Value { return d.makeValueBytes(d.index(i)) }
   704  
   705  func (d *byteArrayDictionary) index(i int32) []byte { return d.valueAt(d.offsets[i]) }
   706  
   707  func (d *byteArrayDictionary) Insert(indexes []int32, values []Value) {
   708  	model := Value{}
   709  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))
   710  }
   711  
   712  func (d *byteArrayDictionary) insert(indexes []int32, rows sparse.Array) {
   713  	_ = indexes[:rows.Len()]
   714  
   715  	if d.hashmap == nil {
   716  		d.hashmap = make(map[string]int32, cap(d.offsets))
   717  		for index, offset := range d.offsets {
   718  			d.hashmap[string(d.valueAt(offset))] = int32(index)
   719  		}
   720  	}
   721  
   722  	for i := 0; i < rows.Len(); i++ {
   723  		value := *(*string)(rows.Index(i))
   724  
   725  		index, exists := d.hashmap[value]
   726  		if !exists {
   727  			index = int32(len(d.offsets))
   728  			value = d.append(value)
   729  			d.hashmap[value] = index
   730  		}
   731  
   732  		indexes[i] = index
   733  	}
   734  }
   735  
   736  func (d *byteArrayDictionary) append(value string) string {
   737  	offset := len(d.values)
   738  	d.values = plain.AppendByteArrayString(d.values, value)
   739  	d.offsets = append(d.offsets, uint32(offset))
   740  	d.numValues++
   741  	return string(d.values[offset+plain.ByteArrayLengthSize : len(d.values)])
   742  }
   743  
   744  func (d *byteArrayDictionary) Lookup(indexes []int32, values []Value) {
   745  	model := d.makeValueString("")
   746  	memsetValues(values, model)
   747  	d.lookupString(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))
   748  }
   749  
   750  func (d *byteArrayDictionary) Bounds(indexes []int32) (min, max Value) {
   751  	if len(indexes) > 0 {
   752  		base := d.index(indexes[0])
   753  		minValue := unsafecast.BytesToString(base)
   754  		maxValue := minValue
   755  		values := [64]string{}
   756  
   757  		for i := 1; i < len(indexes); i += len(values) {
   758  			n := len(indexes) - i
   759  			if n > len(values) {
   760  				n = len(values)
   761  			}
   762  			j := i + n
   763  			d.lookupString(indexes[i:j:j], makeArrayString(values[:n:n]))
   764  
   765  			for _, value := range values[:n:n] {
   766  				switch {
   767  				case value < minValue:
   768  					minValue = value
   769  				case value > maxValue:
   770  					maxValue = value
   771  				}
   772  			}
   773  		}
   774  
   775  		min = d.makeValueString(minValue)
   776  		max = d.makeValueString(maxValue)
   777  	}
   778  	return min, max
   779  }
   780  
   781  func (d *byteArrayDictionary) Reset() {
   782  	d.offsets = d.offsets[:0]
   783  	d.values = d.values[:0]
   784  	d.numValues = 0
   785  	d.hashmap = nil
   786  }
   787  
   788  func (d *byteArrayDictionary) Page() BufferedPage {
   789  	return &d.byteArrayPage
   790  }
   791  
   792  type fixedLenByteArrayDictionary struct {
   793  	fixedLenByteArrayPage
   794  	hashmap map[string]int32
   795  }
   796  
   797  func newFixedLenByteArrayDictionary(typ Type, columnIndex int16, numValues int32, data []byte) *fixedLenByteArrayDictionary {
   798  	size := typ.Length()
   799  	return &fixedLenByteArrayDictionary{
   800  		fixedLenByteArrayPage: fixedLenByteArrayPage{
   801  			typ:         typ,
   802  			size:        size,
   803  			data:        data,
   804  			columnIndex: ^columnIndex,
   805  		},
   806  	}
   807  }
   808  
   809  func (d *fixedLenByteArrayDictionary) Type() Type { return newIndexedType(d.typ, d) }
   810  
   811  func (d *fixedLenByteArrayDictionary) Len() int { return len(d.data) / d.size }
   812  
   813  func (d *fixedLenByteArrayDictionary) Index(i int32) Value {
   814  	return d.makeValueBytes(d.index(i))
   815  }
   816  
   817  func (d *fixedLenByteArrayDictionary) index(i int32) []byte {
   818  	j := (int(i) + 0) * d.size
   819  	k := (int(i) + 1) * d.size
   820  	return d.data[j:k:k]
   821  }
   822  
   823  func (d *fixedLenByteArrayDictionary) Insert(indexes []int32, values []Value) {
   824  	d.insertValues(indexes, len(values), func(i int) *byte {
   825  		return values[i].ptr
   826  	})
   827  }
   828  
   829  func (d *fixedLenByteArrayDictionary) insert(indexes []int32, rows sparse.Array) {
   830  	d.insertValues(indexes, rows.Len(), func(i int) *byte {
   831  		return (*byte)(rows.Index(i))
   832  	})
   833  }
   834  
   835  func (d *fixedLenByteArrayDictionary) insertValues(indexes []int32, count int, valueAt func(int) *byte) {
   836  	_ = indexes[:count]
   837  
   838  	if d.hashmap == nil {
   839  		d.hashmap = make(map[string]int32, cap(d.data)/d.size)
   840  		for i, j := 0, int32(0); i < len(d.data); i += d.size {
   841  			d.hashmap[string(d.data[i:i+d.size])] = j
   842  			j++
   843  		}
   844  	}
   845  
   846  	for i := 0; i < count; i++ {
   847  		value := unsafe.Slice(valueAt(i), d.size)
   848  
   849  		index, exists := d.hashmap[string(value)]
   850  		if !exists {
   851  			index = int32(d.Len())
   852  			start := len(d.data)
   853  			d.data = append(d.data, value...)
   854  			d.hashmap[string(d.data[start:])] = index
   855  		}
   856  
   857  		indexes[i] = index
   858  	}
   859  }
   860  
   861  func (d *fixedLenByteArrayDictionary) Lookup(indexes []int32, values []Value) {
   862  	model := d.makeValueString("")
   863  	memsetValues(values, model)
   864  	d.lookupString(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))
   865  }
   866  
   867  func (d *fixedLenByteArrayDictionary) Bounds(indexes []int32) (min, max Value) {
   868  	if len(indexes) > 0 {
   869  		base := d.index(indexes[0])
   870  		minValue := unsafecast.BytesToString(base)
   871  		maxValue := minValue
   872  		values := [64]string{}
   873  
   874  		for i := 1; i < len(indexes); i += len(values) {
   875  			n := len(indexes) - i
   876  			if n > len(values) {
   877  				n = len(values)
   878  			}
   879  			j := i + n
   880  			d.lookupString(indexes[i:j:j], makeArrayString(values[:n:n]))
   881  
   882  			for _, value := range values[:n:n] {
   883  				switch {
   884  				case value < minValue:
   885  					minValue = value
   886  				case value > maxValue:
   887  					maxValue = value
   888  				}
   889  			}
   890  		}
   891  
   892  		min = d.makeValueString(minValue)
   893  		max = d.makeValueString(maxValue)
   894  	}
   895  	return min, max
   896  }
   897  
   898  func (d *fixedLenByteArrayDictionary) Reset() {
   899  	d.data = d.data[:0]
   900  	d.hashmap = nil
   901  }
   902  
   903  func (d *fixedLenByteArrayDictionary) Page() BufferedPage {
   904  	return &d.fixedLenByteArrayPage
   905  }
   906  
   907  type uint32Dictionary struct {
   908  	uint32Page
   909  	table *hashprobe.Uint32Table
   910  }
   911  
   912  func newUint32Dictionary(typ Type, columnIndex int16, numValues int32, data []byte) *uint32Dictionary {
   913  	return &uint32Dictionary{
   914  		uint32Page: uint32Page{
   915  			typ:         typ,
   916  			values:      unsafecast.BytesToUint32(data)[:numValues],
   917  			columnIndex: ^columnIndex,
   918  		},
   919  	}
   920  }
   921  
   922  func (d *uint32Dictionary) Type() Type { return newIndexedType(d.typ, d) }
   923  
   924  func (d *uint32Dictionary) Len() int { return len(d.values) }
   925  
   926  func (d *uint32Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
   927  
   928  func (d *uint32Dictionary) index(i int32) uint32 { return d.values[i] }
   929  
   930  func (d *uint32Dictionary) Insert(indexes []int32, values []Value) {
   931  	model := Value{}
   932  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   933  }
   934  
   935  func (d *uint32Dictionary) init(indexes []int32) {
   936  	d.table = hashprobe.NewUint32Table(cap(d.values), hashprobeTableMaxLoad)
   937  
   938  	n := min(len(d.values), len(indexes))
   939  
   940  	for i := 0; i < len(d.values); i += n {
   941  		j := min(i+n, len(d.values))
   942  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
   943  	}
   944  }
   945  
   946  func (d *uint32Dictionary) insert(indexes []int32, rows sparse.Array) {
   947  	const chunkSize = insertsTargetCacheFootprint / 4
   948  
   949  	if d.table == nil {
   950  		d.init(indexes)
   951  	}
   952  
   953  	values := rows.Uint32Array()
   954  
   955  	for i := 0; i < values.Len(); i += chunkSize {
   956  		j := min(i+chunkSize, values.Len())
   957  
   958  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
   959  			for k, index := range indexes[i:j] {
   960  				if index == int32(len(d.values)) {
   961  					d.values = append(d.values, values.Index(i+k))
   962  				}
   963  			}
   964  		}
   965  	}
   966  }
   967  
   968  func (d *uint32Dictionary) Lookup(indexes []int32, values []Value) {
   969  	model := d.makeValue(0)
   970  	memsetValues(values, model)
   971  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
   972  }
   973  
   974  func (d *uint32Dictionary) Bounds(indexes []int32) (min, max Value) {
   975  	if len(indexes) > 0 {
   976  		minValue, maxValue := d.bounds(indexes)
   977  		min = d.makeValue(minValue)
   978  		max = d.makeValue(maxValue)
   979  	}
   980  	return min, max
   981  }
   982  
   983  func (d *uint32Dictionary) Reset() {
   984  	d.values = d.values[:0]
   985  	if d.table != nil {
   986  		d.table.Reset()
   987  	}
   988  }
   989  
   990  func (d *uint32Dictionary) Page() BufferedPage {
   991  	return &d.uint32Page
   992  }
   993  
   994  type uint64Dictionary struct {
   995  	uint64Page
   996  	table *hashprobe.Uint64Table
   997  }
   998  
   999  func newUint64Dictionary(typ Type, columnIndex int16, numValues int32, data []byte) *uint64Dictionary {
  1000  	return &uint64Dictionary{
  1001  		uint64Page: uint64Page{
  1002  			typ:         typ,
  1003  			values:      unsafecast.BytesToUint64(data),
  1004  			columnIndex: ^columnIndex,
  1005  		},
  1006  	}
  1007  }
  1008  
  1009  func (d *uint64Dictionary) Type() Type { return newIndexedType(d.typ, d) }
  1010  
  1011  func (d *uint64Dictionary) Len() int { return len(d.values) }
  1012  
  1013  func (d *uint64Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
  1014  
  1015  func (d *uint64Dictionary) index(i int32) uint64 { return d.values[i] }
  1016  
  1017  func (d *uint64Dictionary) Insert(indexes []int32, values []Value) {
  1018  	model := Value{}
  1019  	d.insert(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
  1020  }
  1021  
  1022  func (d *uint64Dictionary) init(indexes []int32) {
  1023  	d.table = hashprobe.NewUint64Table(cap(d.values), hashprobeTableMaxLoad)
  1024  
  1025  	n := min(len(d.values), len(indexes))
  1026  
  1027  	for i := 0; i < len(d.values); i += n {
  1028  		j := min(i+n, len(d.values))
  1029  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
  1030  	}
  1031  }
  1032  
  1033  func (d *uint64Dictionary) insert(indexes []int32, rows sparse.Array) {
  1034  	const chunkSize = insertsTargetCacheFootprint / 8
  1035  
  1036  	if d.table == nil {
  1037  		d.init(indexes)
  1038  	}
  1039  
  1040  	values := rows.Uint64Array()
  1041  
  1042  	for i := 0; i < values.Len(); i += chunkSize {
  1043  		j := min(i+chunkSize, values.Len())
  1044  
  1045  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
  1046  			for k, index := range indexes[i:j] {
  1047  				if index == int32(len(d.values)) {
  1048  					d.values = append(d.values, values.Index(i+k))
  1049  				}
  1050  			}
  1051  		}
  1052  	}
  1053  }
  1054  
  1055  func (d *uint64Dictionary) Lookup(indexes []int32, values []Value) {
  1056  	model := d.makeValue(0)
  1057  	memsetValues(values, model)
  1058  	d.lookup(indexes, makeArrayValue(values, unsafe.Offsetof(model.u64)))
  1059  }
  1060  
  1061  func (d *uint64Dictionary) Bounds(indexes []int32) (min, max Value) {
  1062  	if len(indexes) > 0 {
  1063  		minValue, maxValue := d.bounds(indexes)
  1064  		min = d.makeValue(minValue)
  1065  		max = d.makeValue(maxValue)
  1066  	}
  1067  	return min, max
  1068  }
  1069  
  1070  func (d *uint64Dictionary) Reset() {
  1071  	d.values = d.values[:0]
  1072  	if d.table != nil {
  1073  		d.table.Reset()
  1074  	}
  1075  }
  1076  
  1077  func (d *uint64Dictionary) Page() BufferedPage {
  1078  	return &d.uint64Page
  1079  }
  1080  
  1081  type be128Dictionary struct {
  1082  	be128Page
  1083  	table *hashprobe.Uint128Table
  1084  }
  1085  
  1086  func newBE128Dictionary(typ Type, columnIndex int16, numValues int32, data []byte) *be128Dictionary {
  1087  	return &be128Dictionary{
  1088  		be128Page: be128Page{
  1089  			typ:         typ,
  1090  			values:      unsafecast.BytesToUint128(data),
  1091  			columnIndex: ^columnIndex,
  1092  		},
  1093  	}
  1094  }
  1095  
  1096  func (d *be128Dictionary) Type() Type { return newIndexedType(d.typ, d) }
  1097  
  1098  func (d *be128Dictionary) Len() int { return len(d.values) }
  1099  
  1100  func (d *be128Dictionary) Index(i int32) Value { return d.makeValue(d.index(i)) }
  1101  
  1102  func (d *be128Dictionary) index(i int32) *[16]byte { return &d.values[i] }
  1103  
  1104  func (d *be128Dictionary) Insert(indexes []int32, values []Value) {
  1105  	_ = indexes[:len(values)]
  1106  
  1107  	for _, v := range values {
  1108  		if v.kind != ^int8(FixedLenByteArray) {
  1109  			panic("values inserted in BE128 dictionary must be of type BYTE_ARRAY")
  1110  		}
  1111  		if v.u64 != 16 {
  1112  			panic("values inserted in BE128 dictionary must be of length 16")
  1113  		}
  1114  	}
  1115  
  1116  	if d.table == nil {
  1117  		d.init(indexes)
  1118  	}
  1119  
  1120  	const chunkSize = insertsTargetCacheFootprint / 16
  1121  	var buffer [chunkSize][16]byte
  1122  
  1123  	for i := 0; i < len(values); i += chunkSize {
  1124  		j := min(chunkSize+i, len(values))
  1125  		n := min(chunkSize, len(values)-i)
  1126  
  1127  		probe := buffer[:n:n]
  1128  		writePointersBE128(probe, makeArrayValue(values[i:j], unsafe.Offsetof(values[i].ptr)))
  1129  
  1130  		if d.table.Probe(probe, indexes[i:j:j]) > 0 {
  1131  			for k, v := range probe {
  1132  				if indexes[i+k] == int32(len(d.values)) {
  1133  					d.values = append(d.values, v)
  1134  				}
  1135  			}
  1136  		}
  1137  	}
  1138  }
  1139  
  1140  func (d *be128Dictionary) init(indexes []int32) {
  1141  	d.table = hashprobe.NewUint128Table(cap(d.values), 0.75)
  1142  
  1143  	n := min(len(d.values), len(indexes))
  1144  
  1145  	for i := 0; i < len(d.values); i += n {
  1146  		j := min(i+n, len(d.values))
  1147  		d.table.Probe(d.values[i:j:j], indexes[:n:n])
  1148  	}
  1149  }
  1150  
  1151  func (d *be128Dictionary) insert(indexes []int32, rows sparse.Array) {
  1152  	const chunkSize = insertsTargetCacheFootprint / 16
  1153  
  1154  	if d.table == nil {
  1155  		d.init(indexes)
  1156  	}
  1157  
  1158  	values := rows.Uint128Array()
  1159  
  1160  	for i := 0; i < values.Len(); i += chunkSize {
  1161  		j := min(i+chunkSize, values.Len())
  1162  
  1163  		if d.table.ProbeArray(values.Slice(i, j), indexes[i:j:j]) > 0 {
  1164  			for k, index := range indexes[i:j] {
  1165  				if index == int32(len(d.values)) {
  1166  					d.values = append(d.values, values.Index(i+k))
  1167  				}
  1168  			}
  1169  		}
  1170  	}
  1171  }
  1172  
  1173  func (d *be128Dictionary) Lookup(indexes []int32, values []Value) {
  1174  	model := d.makeValueString("")
  1175  	memsetValues(values, model)
  1176  	d.lookupString(indexes, makeArrayValue(values, unsafe.Offsetof(model.ptr)))
  1177  }
  1178  
  1179  func (d *be128Dictionary) Bounds(indexes []int32) (min, max Value) {
  1180  	if len(indexes) > 0 {
  1181  		minValue, maxValue := d.bounds(indexes)
  1182  		min = d.makeValue(minValue)
  1183  		max = d.makeValue(maxValue)
  1184  	}
  1185  	return min, max
  1186  }
  1187  
  1188  func (d *be128Dictionary) Reset() {
  1189  	d.values = d.values[:0]
  1190  	if d.table != nil {
  1191  		d.table.Reset()
  1192  	}
  1193  }
  1194  
  1195  func (d *be128Dictionary) Page() BufferedPage {
  1196  	return &d.be128Page
  1197  }
  1198  
  1199  // indexedType is a wrapper around a Type value which overrides object
  1200  // constructors to use indexed versions referencing values in the dictionary
  1201  // instead of storing plain values.
  1202  type indexedType struct {
  1203  	Type
  1204  	dict Dictionary
  1205  }
  1206  
  1207  func newIndexedType(typ Type, dict Dictionary) *indexedType {
  1208  	return &indexedType{Type: typ, dict: dict}
  1209  }
  1210  
  1211  func (t *indexedType) NewColumnBuffer(columnIndex, numValues int) ColumnBuffer {
  1212  	return newIndexedColumnBuffer(t, makeColumnIndex(columnIndex), makeNumValues(numValues))
  1213  }
  1214  
  1215  func (t *indexedType) NewPage(columnIndex, numValues int, data []byte) Page {
  1216  	return newIndexedPage(t, makeColumnIndex(columnIndex), makeNumValues(numValues), data)
  1217  }
  1218  
  1219  // indexedPage is an implementation of the BufferedPage interface which stores
  1220  // indexes instead of plain value. The indexes reference the values in a
  1221  // dictionary that the page was created for.
  1222  type indexedPage struct {
  1223  	typ         *indexedType
  1224  	values      []int32
  1225  	columnIndex int16
  1226  }
  1227  
  1228  func newIndexedPage(typ *indexedType, columnIndex int16, numValues int32, values []byte) *indexedPage {
  1229  	// RLE encoded values that contain dictionary indexes in data pages are
  1230  	// sometimes truncated when they contain only zeros. We account for this
  1231  	// special case here and extend the values buffer if it is shorter than
  1232  	// needed to hold `numValues`.
  1233  	size := 4 * int(numValues)
  1234  
  1235  	if len(values) < size {
  1236  		if cap(values) < size {
  1237  			tmp := make([]byte, size)
  1238  			copy(tmp, values)
  1239  			values = tmp
  1240  		} else {
  1241  			clear := values[len(values) : len(values)+size]
  1242  			for i := range clear {
  1243  				clear[i] = 0
  1244  			}
  1245  		}
  1246  	}
  1247  
  1248  	return &indexedPage{
  1249  		typ:         typ,
  1250  		values:      unsafecast.BytesToInt32(values[:size]),
  1251  		columnIndex: ^columnIndex,
  1252  	}
  1253  }
  1254  
  1255  func (page *indexedPage) Type() Type { return indexedPageType{page.typ} }
  1256  
  1257  func (page *indexedPage) Column() int { return int(^page.columnIndex) }
  1258  
  1259  func (page *indexedPage) Dictionary() Dictionary { return page.typ.dict }
  1260  
  1261  func (page *indexedPage) NumRows() int64 { return int64(len(page.values)) }
  1262  
  1263  func (page *indexedPage) NumValues() int64 { return int64(len(page.values)) }
  1264  
  1265  func (page *indexedPage) NumNulls() int64 { return 0 }
  1266  
  1267  func (page *indexedPage) Size() int64 { return 4 * int64(len(page.values)) }
  1268  
  1269  func (page *indexedPage) RepetitionLevels() []byte { return nil }
  1270  
  1271  func (page *indexedPage) DefinitionLevels() []byte { return nil }
  1272  
  1273  func (page *indexedPage) Data() []byte { return unsafecast.Int32ToBytes(page.values) }
  1274  
  1275  func (page *indexedPage) Values() ValueReader { return &indexedPageValues{page: page} }
  1276  
  1277  func (page *indexedPage) Buffer() BufferedPage { return page }
  1278  
  1279  func (page *indexedPage) Bounds() (min, max Value, ok bool) {
  1280  	if ok = len(page.values) > 0; ok {
  1281  		min, max = page.typ.dict.Bounds(page.values)
  1282  		min.columnIndex = page.columnIndex
  1283  		max.columnIndex = page.columnIndex
  1284  	}
  1285  	return min, max, ok
  1286  }
  1287  
  1288  func (page *indexedPage) Clone() BufferedPage {
  1289  	return &indexedPage{
  1290  		typ:         page.typ,
  1291  		values:      append([]int32{}, page.values...),
  1292  		columnIndex: page.columnIndex,
  1293  	}
  1294  }
  1295  
  1296  func (page *indexedPage) Slice(i, j int64) BufferedPage {
  1297  	return &indexedPage{
  1298  		typ:         page.typ,
  1299  		values:      page.values[i:j],
  1300  		columnIndex: page.columnIndex,
  1301  	}
  1302  }
  1303  
  1304  // indexedPageType is an adapter for the indexedType returned when accessing
  1305  // the type of an indexedPage value. It overrides the Encode/Decode methods to
  1306  // account for the fact that an indexed page is holding indexes of values into
  1307  // its dictionary instead of plain values.
  1308  type indexedPageType struct{ *indexedType }
  1309  
  1310  func (t indexedPageType) Encode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
  1311  	return enc.EncodeInt32(dst, src)
  1312  }
  1313  
  1314  func (t indexedPageType) Decode(dst, src []byte, enc encoding.Encoding) ([]byte, error) {
  1315  	return enc.DecodeInt32(dst, src)
  1316  }
  1317  
  1318  type indexedPageValues struct {
  1319  	page   *indexedPage
  1320  	offset int
  1321  }
  1322  
  1323  func (r *indexedPageValues) ReadValues(values []Value) (n int, err error) {
  1324  	if n = len(r.page.values) - r.offset; n == 0 {
  1325  		return 0, io.EOF
  1326  	}
  1327  	if n > len(values) {
  1328  		n = len(values)
  1329  	}
  1330  	r.page.typ.dict.Lookup(r.page.values[r.offset:r.offset+n], values[:n])
  1331  	r.offset += n
  1332  	if r.offset == len(r.page.values) {
  1333  		err = io.EOF
  1334  	}
  1335  	return n, err
  1336  }
  1337  
  1338  // indexedColumnBuffer is an implementation of the ColumnBuffer interface which
  1339  // builds a page of indexes into a parent dictionary when values are written.
  1340  type indexedColumnBuffer struct{ indexedPage }
  1341  
  1342  func newIndexedColumnBuffer(typ *indexedType, columnIndex int16, numValues int32) *indexedColumnBuffer {
  1343  	return &indexedColumnBuffer{
  1344  		indexedPage: indexedPage{
  1345  			typ:         typ,
  1346  			values:      make([]int32, 0, numValues),
  1347  			columnIndex: ^columnIndex,
  1348  		},
  1349  	}
  1350  }
  1351  
  1352  func (col *indexedColumnBuffer) Clone() ColumnBuffer {
  1353  	return &indexedColumnBuffer{
  1354  		indexedPage: indexedPage{
  1355  			typ:         col.typ,
  1356  			values:      append([]int32{}, col.values...),
  1357  			columnIndex: col.columnIndex,
  1358  		},
  1359  	}
  1360  }
  1361  
  1362  func (col *indexedColumnBuffer) ColumnIndex() ColumnIndex { return indexedColumnIndex{col} }
  1363  
  1364  func (col *indexedColumnBuffer) OffsetIndex() OffsetIndex { return indexedOffsetIndex{col} }
  1365  
  1366  func (col *indexedColumnBuffer) BloomFilter() BloomFilter { return nil }
  1367  
  1368  func (col *indexedColumnBuffer) Dictionary() Dictionary { return col.typ.dict }
  1369  
  1370  func (col *indexedColumnBuffer) Pages() Pages { return onePage(col.Page()) }
  1371  
  1372  func (col *indexedColumnBuffer) Page() BufferedPage { return &col.indexedPage }
  1373  
  1374  func (col *indexedColumnBuffer) Reset() { col.values = col.values[:0] }
  1375  
  1376  func (col *indexedColumnBuffer) Cap() int { return cap(col.values) }
  1377  
  1378  func (col *indexedColumnBuffer) Len() int { return len(col.values) }
  1379  
  1380  func (col *indexedColumnBuffer) Less(i, j int) bool {
  1381  	u := col.typ.dict.Index(col.values[i])
  1382  	v := col.typ.dict.Index(col.values[j])
  1383  	return col.typ.Compare(u, v) < 0
  1384  }
  1385  
  1386  func (col *indexedColumnBuffer) Swap(i, j int) {
  1387  	col.values[i], col.values[j] = col.values[j], col.values[i]
  1388  }
  1389  
  1390  func (col *indexedColumnBuffer) WriteValues(values []Value) (int, error) {
  1391  	i := len(col.values)
  1392  	j := len(col.values) + len(values)
  1393  
  1394  	if j <= cap(col.values) {
  1395  		col.values = col.values[:j]
  1396  	} else {
  1397  		tmp := make([]int32, j, 2*j)
  1398  		copy(tmp, col.values)
  1399  		col.values = tmp
  1400  	}
  1401  
  1402  	col.typ.dict.Insert(col.values[i:], values)
  1403  	return len(values), nil
  1404  }
  1405  
  1406  func (col *indexedColumnBuffer) writeValues(rows sparse.Array, _ columnLevels) {
  1407  	i := len(col.values)
  1408  	j := len(col.values) + rows.Len()
  1409  
  1410  	if j <= cap(col.values) {
  1411  		col.values = col.values[:j]
  1412  	} else {
  1413  		tmp := make([]int32, j, 2*j)
  1414  		copy(tmp, col.values)
  1415  		col.values = tmp
  1416  	}
  1417  
  1418  	col.typ.dict.insert(col.values[i:], rows)
  1419  }
  1420  
  1421  func (col *indexedColumnBuffer) ReadValuesAt(values []Value, offset int64) (n int, err error) {
  1422  	i := int(offset)
  1423  	switch {
  1424  	case i < 0:
  1425  		return 0, errRowIndexOutOfBounds(offset, int64(len(col.values)))
  1426  	case i >= len(col.values):
  1427  		return 0, io.EOF
  1428  	default:
  1429  		for n < len(values) && i < len(col.values) {
  1430  			values[n] = col.typ.dict.Index(col.values[i])
  1431  			values[n].columnIndex = col.columnIndex
  1432  			n++
  1433  			i++
  1434  		}
  1435  		if n < len(values) {
  1436  			err = io.EOF
  1437  		}
  1438  		return n, err
  1439  	}
  1440  }
  1441  
  1442  func (col *indexedColumnBuffer) ReadRowAt(row Row, index int64) (Row, error) {
  1443  	switch {
  1444  	case index < 0:
  1445  		return row, errRowIndexOutOfBounds(index, int64(len(col.values)))
  1446  	case index >= int64(len(col.values)):
  1447  		return row, io.EOF
  1448  	default:
  1449  		v := col.typ.dict.Index(col.values[index])
  1450  		v.columnIndex = col.columnIndex
  1451  		return append(row, v), nil
  1452  	}
  1453  }
  1454  
  1455  type indexedColumnIndex struct{ col *indexedColumnBuffer }
  1456  
  1457  func (index indexedColumnIndex) NumPages() int       { return 1 }
  1458  func (index indexedColumnIndex) NullCount(int) int64 { return 0 }
  1459  func (index indexedColumnIndex) NullPage(int) bool   { return false }
  1460  func (index indexedColumnIndex) MinValue(int) Value {
  1461  	min, _, _ := index.col.Bounds()
  1462  	return min
  1463  }
  1464  func (index indexedColumnIndex) MaxValue(int) Value {
  1465  	_, max, _ := index.col.Bounds()
  1466  	return max
  1467  }
  1468  func (index indexedColumnIndex) IsAscending() bool {
  1469  	min, max, _ := index.col.Bounds()
  1470  	return index.col.typ.Compare(min, max) <= 0
  1471  }
  1472  func (index indexedColumnIndex) IsDescending() bool {
  1473  	min, max, _ := index.col.Bounds()
  1474  	return index.col.typ.Compare(min, max) > 0
  1475  }
  1476  
  1477  type indexedOffsetIndex struct{ col *indexedColumnBuffer }
  1478  
  1479  func (index indexedOffsetIndex) NumPages() int                { return 1 }
  1480  func (index indexedOffsetIndex) Offset(int) int64             { return 0 }
  1481  func (index indexedOffsetIndex) CompressedPageSize(int) int64 { return index.col.Size() }
  1482  func (index indexedOffsetIndex) FirstRowIndex(int) int64      { return 0 }