github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/hashing/xxh3_memo_table.gen.go (about)

     1  // Code generated by xxh3_memo_table.gen.go.tmpl. DO NOT EDIT.
     2  
     3  // Licensed to the Apache Software Foundation (ASF) under one
     4  // or more contributor license agreements.  See the NOTICE file
     5  // distributed with this work for additional information
     6  // regarding copyright ownership.  The ASF licenses this file
     7  // to you under the Apache License, Version 2.0 (the
     8  // "License"); you may not use this file except in compliance
     9  // with the License.  You may obtain a copy of the License at
    10  //
    11  // http://www.apache.org/licenses/LICENSE-2.0
    12  //
    13  // Unless required by applicable law or agreed to in writing, software
    14  // distributed under the License is distributed on an "AS IS" BASIS,
    15  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    16  // See the License for the specific language governing permissions and
    17  // limitations under the License.
    18  
    19  package hashing
    20  
    21  import (
    22  	"math"
    23  
    24  	"github.com/apache/arrow/go/v7/arrow"
    25  	"github.com/apache/arrow/go/v7/arrow/bitutil"
    26  	"github.com/apache/arrow/go/v7/parquet/internal/utils"
    27  )
    28  
// payloadInt32 is the data carried by a hash table slot: the int32 value
// itself and the memo index that was assigned to it on insertion.
type payloadInt32 struct {
	val     int32 // the stored value
	memoIdx int32 // index assigned to val; used as the output position when values are copied out
}
    33  
// entryInt32 is a single slot of an Int32HashTable.
type entryInt32 struct {
	h       uint64       // stored hash; equal to sentinel while the slot is unused (see Valid)
	payload payloadInt32 // value and memo index held by this slot
}
    38  
    39  func (e entryInt32) Valid() bool { return e.h != sentinel }
    40  
// Int32HashTable is a hashtable specifically for int32 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
//
// Slots live in a flat slice and collisions are resolved by probing
// other slots of the same slice.
type Int32HashTable struct {
	cap     uint64 // number of slots in entries (a power of two as allocated by NewInt32HashTable and Reset)
	capMask uint64 // cap - 1; masks a hash down to a slot index
	size    uint64 // number of values inserted so far

	entries []entryInt32 // slot storage; unused slots carry the sentinel hash
}
    51  
    52  // NewInt32HashTable returns a new hash table for int32 values
    53  // initialized with the passed in capacity or 32 whichever is larger.
    54  func NewInt32HashTable(cap uint64) *Int32HashTable {
    55  	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
    56  	ret := &Int32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
    57  	ret.entries = make([]entryInt32, initCap)
    58  	return ret
    59  }
    60  
    61  // Reset drops all of the values in this hash table and re-initializes it
    62  // with the specified initial capacity as if by calling New, but without having
    63  // to reallocate the object.
    64  func (h *Int32HashTable) Reset(cap uint64) {
    65  	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
    66  	h.capMask = h.cap - 1
    67  	h.size = 0
    68  	h.entries = make([]entryInt32, h.cap)
    69  }
    70  
    71  // CopyValues is used for copying the values out of the hash table into the
    72  // passed in slice, in the order that they were first inserted
    73  func (h *Int32HashTable) CopyValues(out []int32) {
    74  	h.CopyValuesSubset(0, out)
    75  }
    76  
    77  // CopyValuesSubset copies a subset of the values in the hashtable out, starting
    78  // with the value at start, in the order that they were inserted.
    79  func (h *Int32HashTable) CopyValuesSubset(start int, out []int32) {
    80  	h.VisitEntries(func(e *entryInt32) {
    81  		idx := e.payload.memoIdx - int32(start)
    82  		if idx >= 0 {
    83  			out[idx] = e.payload.val
    84  		}
    85  	})
    86  }
    87  
    88  func (h *Int32HashTable) WriteOut(out []byte) {
    89  	h.WriteOutSubset(0, out)
    90  }
    91  
    92  func (h *Int32HashTable) WriteOutSubset(start int, out []byte) {
    93  	data := arrow.Int32Traits.CastFromBytes(out)
    94  	h.VisitEntries(func(e *entryInt32) {
    95  		idx := e.payload.memoIdx - int32(start)
    96  		if idx >= 0 {
    97  			data[idx] = utils.ToLEInt32(e.payload.val)
    98  		}
    99  	})
   100  }
   101  
   102  func (h *Int32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
   103  
   104  func (Int32HashTable) fixHash(v uint64) uint64 {
   105  	if v == sentinel {
   106  		return 42
   107  	}
   108  	return v
   109  }
   110  
   111  // Lookup retrieves the entry for a given hash value assuming it's payload value returns
   112  // true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
   113  // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
   114  func (h *Int32HashTable) Lookup(v uint64, cmp func(int32) bool) (*entryInt32, bool) {
   115  	idx, ok := h.lookup(v, h.capMask, cmp)
   116  	return &h.entries[idx], ok
   117  }
   118  
   119  func (h *Int32HashTable) lookup(v uint64, szMask uint64, cmp func(int32) bool) (uint64, bool) {
   120  	const perturbShift uint8 = 5
   121  
   122  	var (
   123  		idx     uint64
   124  		perturb uint64
   125  		e       *entryInt32
   126  	)
   127  
   128  	v = h.fixHash(v)
   129  	idx = v & szMask
   130  	perturb = (v >> uint64(perturbShift)) + 1
   131  
   132  	for {
   133  		e = &h.entries[idx]
   134  		if e.h == v && cmp(e.payload.val) {
   135  			return idx, true
   136  		}
   137  
   138  		if e.h == sentinel {
   139  			return idx, false
   140  		}
   141  
   142  		// perturbation logic inspired from CPython's set/dict object
   143  		// the goal is that all 64 bits of unmasked hash value eventually
   144  		// participate int he probing sequence, to minimize clustering
   145  		idx = (idx + perturb) & szMask
   146  		perturb = (perturb >> uint64(perturbShift)) + 1
   147  	}
   148  }
   149  
   150  func (h *Int32HashTable) upsize(newcap uint64) error {
   151  	newMask := newcap - 1
   152  
   153  	oldEntries := h.entries
   154  	h.entries = make([]entryInt32, newcap)
   155  	for _, e := range oldEntries {
   156  		if e.Valid() {
   157  			idx, _ := h.lookup(e.h, newMask, func(int32) bool { return false })
   158  			h.entries[idx] = e
   159  		}
   160  	}
   161  	h.cap = newcap
   162  	h.capMask = newMask
   163  	return nil
   164  }
   165  
   166  // Insert updates the given entry with the provided hash value, payload value and memo index.
   167  // The entry pointer must have been retrieved via lookup in order to actually insert properly.
   168  func (h *Int32HashTable) Insert(e *entryInt32, v uint64, val int32, memoIdx int32) error {
   169  	e.h = h.fixHash(v)
   170  	e.payload.val = val
   171  	e.payload.memoIdx = memoIdx
   172  	h.size++
   173  
   174  	if h.needUpsize() {
   175  		h.upsize(h.cap * uint64(loadFactor) * 2)
   176  	}
   177  	return nil
   178  }
   179  
   180  // VisitEntries will call the passed in function on each *valid* entry in the hash table,
   181  // a valid entry being one which has had a value inserted into it.
   182  func (h *Int32HashTable) VisitEntries(visit func(*entryInt32)) {
   183  	for _, e := range h.entries {
   184  		if e.Valid() {
   185  			visit(&e)
   186  		}
   187  	}
   188  }
   189  
// Int32MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Int32MemoTable struct {
	tbl     *Int32HashTable // backing hash table for the non-null values
	nullIdx int32           // memo index assigned to null, or KeyNotFound when no null has been inserted
}
   197  
   198  // NewInt32MemoTable returns a new memotable with num entries pre-allocated to reduce further
   199  // allocations when inserting.
   200  func NewInt32MemoTable(num int64) *Int32MemoTable {
   201  	return &Int32MemoTable{tbl: NewInt32HashTable(uint64(num)), nullIdx: KeyNotFound}
   202  }
   203  
   204  // Reset allows this table to be re-used by dumping all the data currently in the table.
   205  func (s *Int32MemoTable) Reset() {
   206  	s.tbl.Reset(32)
   207  	s.nullIdx = KeyNotFound
   208  }
   209  
   210  // Size returns the current number of inserted elements into the table including if a null
   211  // has been inserted.
   212  func (s *Int32MemoTable) Size() int {
   213  	sz := int(s.tbl.size)
   214  	if _, ok := s.GetNull(); ok {
   215  		sz++
   216  	}
   217  	return sz
   218  }
   219  
   220  // GetNull returns the index of an inserted null or KeyNotFound along with a bool
   221  // that will be true if found and false if not.
   222  func (s *Int32MemoTable) GetNull() (int, bool) {
   223  	return int(s.nullIdx), s.nullIdx != KeyNotFound
   224  }
   225  
   226  // GetOrInsertNull will return the index of the null entry or insert a null entry
   227  // if one currently doesn't exist. The found value will be true if there was already
   228  // a null in the table, and false if it inserted one.
   229  func (s *Int32MemoTable) GetOrInsertNull() (idx int, found bool) {
   230  	idx, found = s.GetNull()
   231  	if !found {
   232  		idx = s.Size()
   233  		s.nullIdx = int32(idx)
   234  	}
   235  	return
   236  }
   237  
   238  // CopyValues will copy the values from the memo table out into the passed in slice
   239  // which must be of the appropriate type.
   240  func (s *Int32MemoTable) CopyValues(out interface{}) {
   241  	s.CopyValuesSubset(0, out)
   242  }
   243  
   244  // CopyValuesSubset is like CopyValues but only copies a subset of values starting
   245  // at the provided start index
   246  func (s *Int32MemoTable) CopyValuesSubset(start int, out interface{}) {
   247  	s.tbl.CopyValuesSubset(start, out.([]int32))
   248  }
   249  
   250  func (s *Int32MemoTable) WriteOut(out []byte) {
   251  	s.tbl.WriteOut(out)
   252  }
   253  
   254  func (s *Int32MemoTable) WriteOutSubset(start int, out []byte) {
   255  	s.tbl.WriteOutSubset(start, out)
   256  }
   257  
   258  // Get returns the index of the requested value in the hash table or KeyNotFound
   259  // along with a boolean indicating if it was found or not.
   260  func (s *Int32MemoTable) Get(val interface{}) (int, bool) {
   261  
   262  	h := hashInt(uint64(val.(int32)), 0)
   263  	if e, ok := s.tbl.Lookup(h, func(v int32) bool { return val.(int32) == v }); ok {
   264  		return int(e.payload.memoIdx), ok
   265  	}
   266  	return KeyNotFound, false
   267  }
   268  
   269  // GetOrInsert will return the index of the specified value in the table, or insert the
   270  // value into the table and return the new index. found indicates whether or not it already
   271  // existed in the table (true) or was inserted by this call (false).
   272  func (s *Int32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   273  
   274  	h := hashInt(uint64(val.(int32)), 0)
   275  	e, ok := s.tbl.Lookup(h, func(v int32) bool {
   276  		return val.(int32) == v
   277  	})
   278  
   279  	if ok {
   280  		idx = int(e.payload.memoIdx)
   281  		found = true
   282  	} else {
   283  		idx = s.Size()
   284  		s.tbl.Insert(e, h, val.(int32), int32(idx))
   285  	}
   286  	return
   287  }
   288  
// payloadInt64 is the data carried by a hash table slot: the int64 value
// itself and the memo index that was assigned to it on insertion.
type payloadInt64 struct {
	val     int64 // the stored value
	memoIdx int32 // index assigned to val; used as the output position when values are copied out
}
   293  
// entryInt64 is a single slot of an Int64HashTable.
type entryInt64 struct {
	h       uint64       // stored hash; equal to sentinel while the slot is unused (see Valid)
	payload payloadInt64 // value and memo index held by this slot
}
   298  
   299  func (e entryInt64) Valid() bool { return e.h != sentinel }
   300  
// Int64HashTable is a hashtable specifically for int64 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
//
// Slots live in a flat slice and collisions are resolved by probing
// other slots of the same slice.
type Int64HashTable struct {
	cap     uint64 // number of slots in entries (a power of two as allocated by NewInt64HashTable and Reset)
	capMask uint64 // cap - 1; masks a hash down to a slot index
	size    uint64 // number of values inserted so far

	entries []entryInt64 // slot storage; unused slots carry the sentinel hash
}
   311  
   312  // NewInt64HashTable returns a new hash table for int64 values
   313  // initialized with the passed in capacity or 32 whichever is larger.
   314  func NewInt64HashTable(cap uint64) *Int64HashTable {
   315  	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
   316  	ret := &Int64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
   317  	ret.entries = make([]entryInt64, initCap)
   318  	return ret
   319  }
   320  
   321  // Reset drops all of the values in this hash table and re-initializes it
   322  // with the specified initial capacity as if by calling New, but without having
   323  // to reallocate the object.
   324  func (h *Int64HashTable) Reset(cap uint64) {
   325  	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
   326  	h.capMask = h.cap - 1
   327  	h.size = 0
   328  	h.entries = make([]entryInt64, h.cap)
   329  }
   330  
   331  // CopyValues is used for copying the values out of the hash table into the
   332  // passed in slice, in the order that they were first inserted
   333  func (h *Int64HashTable) CopyValues(out []int64) {
   334  	h.CopyValuesSubset(0, out)
   335  }
   336  
   337  // CopyValuesSubset copies a subset of the values in the hashtable out, starting
   338  // with the value at start, in the order that they were inserted.
   339  func (h *Int64HashTable) CopyValuesSubset(start int, out []int64) {
   340  	h.VisitEntries(func(e *entryInt64) {
   341  		idx := e.payload.memoIdx - int32(start)
   342  		if idx >= 0 {
   343  			out[idx] = e.payload.val
   344  		}
   345  	})
   346  }
   347  
   348  func (h *Int64HashTable) WriteOut(out []byte) {
   349  	h.WriteOutSubset(0, out)
   350  }
   351  
   352  func (h *Int64HashTable) WriteOutSubset(start int, out []byte) {
   353  	data := arrow.Int64Traits.CastFromBytes(out)
   354  	h.VisitEntries(func(e *entryInt64) {
   355  		idx := e.payload.memoIdx - int32(start)
   356  		if idx >= 0 {
   357  			data[idx] = utils.ToLEInt64(e.payload.val)
   358  		}
   359  	})
   360  }
   361  
   362  func (h *Int64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
   363  
   364  func (Int64HashTable) fixHash(v uint64) uint64 {
   365  	if v == sentinel {
   366  		return 42
   367  	}
   368  	return v
   369  }
   370  
   371  // Lookup retrieves the entry for a given hash value assuming it's payload value returns
   372  // true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
   373  // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
   374  func (h *Int64HashTable) Lookup(v uint64, cmp func(int64) bool) (*entryInt64, bool) {
   375  	idx, ok := h.lookup(v, h.capMask, cmp)
   376  	return &h.entries[idx], ok
   377  }
   378  
   379  func (h *Int64HashTable) lookup(v uint64, szMask uint64, cmp func(int64) bool) (uint64, bool) {
   380  	const perturbShift uint8 = 5
   381  
   382  	var (
   383  		idx     uint64
   384  		perturb uint64
   385  		e       *entryInt64
   386  	)
   387  
   388  	v = h.fixHash(v)
   389  	idx = v & szMask
   390  	perturb = (v >> uint64(perturbShift)) + 1
   391  
   392  	for {
   393  		e = &h.entries[idx]
   394  		if e.h == v && cmp(e.payload.val) {
   395  			return idx, true
   396  		}
   397  
   398  		if e.h == sentinel {
   399  			return idx, false
   400  		}
   401  
   402  		// perturbation logic inspired from CPython's set/dict object
   403  		// the goal is that all 64 bits of unmasked hash value eventually
   404  		// participate int he probing sequence, to minimize clustering
   405  		idx = (idx + perturb) & szMask
   406  		perturb = (perturb >> uint64(perturbShift)) + 1
   407  	}
   408  }
   409  
   410  func (h *Int64HashTable) upsize(newcap uint64) error {
   411  	newMask := newcap - 1
   412  
   413  	oldEntries := h.entries
   414  	h.entries = make([]entryInt64, newcap)
   415  	for _, e := range oldEntries {
   416  		if e.Valid() {
   417  			idx, _ := h.lookup(e.h, newMask, func(int64) bool { return false })
   418  			h.entries[idx] = e
   419  		}
   420  	}
   421  	h.cap = newcap
   422  	h.capMask = newMask
   423  	return nil
   424  }
   425  
   426  // Insert updates the given entry with the provided hash value, payload value and memo index.
   427  // The entry pointer must have been retrieved via lookup in order to actually insert properly.
   428  func (h *Int64HashTable) Insert(e *entryInt64, v uint64, val int64, memoIdx int32) error {
   429  	e.h = h.fixHash(v)
   430  	e.payload.val = val
   431  	e.payload.memoIdx = memoIdx
   432  	h.size++
   433  
   434  	if h.needUpsize() {
   435  		h.upsize(h.cap * uint64(loadFactor) * 2)
   436  	}
   437  	return nil
   438  }
   439  
   440  // VisitEntries will call the passed in function on each *valid* entry in the hash table,
   441  // a valid entry being one which has had a value inserted into it.
   442  func (h *Int64HashTable) VisitEntries(visit func(*entryInt64)) {
   443  	for _, e := range h.entries {
   444  		if e.Valid() {
   445  			visit(&e)
   446  		}
   447  	}
   448  }
   449  
// Int64MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Int64MemoTable struct {
	tbl     *Int64HashTable // backing hash table for the non-null values
	nullIdx int32           // memo index assigned to null, or KeyNotFound when no null has been inserted
}
   457  
   458  // NewInt64MemoTable returns a new memotable with num entries pre-allocated to reduce further
   459  // allocations when inserting.
   460  func NewInt64MemoTable(num int64) *Int64MemoTable {
   461  	return &Int64MemoTable{tbl: NewInt64HashTable(uint64(num)), nullIdx: KeyNotFound}
   462  }
   463  
   464  // Reset allows this table to be re-used by dumping all the data currently in the table.
   465  func (s *Int64MemoTable) Reset() {
   466  	s.tbl.Reset(32)
   467  	s.nullIdx = KeyNotFound
   468  }
   469  
   470  // Size returns the current number of inserted elements into the table including if a null
   471  // has been inserted.
   472  func (s *Int64MemoTable) Size() int {
   473  	sz := int(s.tbl.size)
   474  	if _, ok := s.GetNull(); ok {
   475  		sz++
   476  	}
   477  	return sz
   478  }
   479  
   480  // GetNull returns the index of an inserted null or KeyNotFound along with a bool
   481  // that will be true if found and false if not.
   482  func (s *Int64MemoTable) GetNull() (int, bool) {
   483  	return int(s.nullIdx), s.nullIdx != KeyNotFound
   484  }
   485  
   486  // GetOrInsertNull will return the index of the null entry or insert a null entry
   487  // if one currently doesn't exist. The found value will be true if there was already
   488  // a null in the table, and false if it inserted one.
   489  func (s *Int64MemoTable) GetOrInsertNull() (idx int, found bool) {
   490  	idx, found = s.GetNull()
   491  	if !found {
   492  		idx = s.Size()
   493  		s.nullIdx = int32(idx)
   494  	}
   495  	return
   496  }
   497  
   498  // CopyValues will copy the values from the memo table out into the passed in slice
   499  // which must be of the appropriate type.
   500  func (s *Int64MemoTable) CopyValues(out interface{}) {
   501  	s.CopyValuesSubset(0, out)
   502  }
   503  
   504  // CopyValuesSubset is like CopyValues but only copies a subset of values starting
   505  // at the provided start index
   506  func (s *Int64MemoTable) CopyValuesSubset(start int, out interface{}) {
   507  	s.tbl.CopyValuesSubset(start, out.([]int64))
   508  }
   509  
   510  func (s *Int64MemoTable) WriteOut(out []byte) {
   511  	s.tbl.WriteOut(out)
   512  }
   513  
   514  func (s *Int64MemoTable) WriteOutSubset(start int, out []byte) {
   515  	s.tbl.WriteOutSubset(start, out)
   516  }
   517  
   518  // Get returns the index of the requested value in the hash table or KeyNotFound
   519  // along with a boolean indicating if it was found or not.
   520  func (s *Int64MemoTable) Get(val interface{}) (int, bool) {
   521  
   522  	h := hashInt(uint64(val.(int64)), 0)
   523  	if e, ok := s.tbl.Lookup(h, func(v int64) bool { return val.(int64) == v }); ok {
   524  		return int(e.payload.memoIdx), ok
   525  	}
   526  	return KeyNotFound, false
   527  }
   528  
   529  // GetOrInsert will return the index of the specified value in the table, or insert the
   530  // value into the table and return the new index. found indicates whether or not it already
   531  // existed in the table (true) or was inserted by this call (false).
   532  func (s *Int64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   533  
   534  	h := hashInt(uint64(val.(int64)), 0)
   535  	e, ok := s.tbl.Lookup(h, func(v int64) bool {
   536  		return val.(int64) == v
   537  	})
   538  
   539  	if ok {
   540  		idx = int(e.payload.memoIdx)
   541  		found = true
   542  	} else {
   543  		idx = s.Size()
   544  		s.tbl.Insert(e, h, val.(int64), int32(idx))
   545  	}
   546  	return
   547  }
   548  
// payloadFloat32 is the data carried by a hash table slot: the float32 value
// itself and the memo index that was assigned to it on insertion.
type payloadFloat32 struct {
	val     float32 // the stored value
	memoIdx int32   // index assigned to val; used as the output position when values are copied out
}
   553  
// entryFloat32 is a single slot of a Float32HashTable.
type entryFloat32 struct {
	h       uint64         // stored hash; equal to sentinel while the slot is unused (see Valid)
	payload payloadFloat32 // value and memo index held by this slot
}
   558  
   559  func (e entryFloat32) Valid() bool { return e.h != sentinel }
   560  
// Float32HashTable is a hashtable specifically for float32 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
//
// Slots live in a flat slice and collisions are resolved by probing
// other slots of the same slice.
type Float32HashTable struct {
	cap     uint64 // number of slots in entries (a power of two as allocated by NewFloat32HashTable and Reset)
	capMask uint64 // cap - 1; masks a hash down to a slot index
	size    uint64 // number of values inserted so far

	entries []entryFloat32 // slot storage; unused slots carry the sentinel hash
}
   571  
   572  // NewFloat32HashTable returns a new hash table for float32 values
   573  // initialized with the passed in capacity or 32 whichever is larger.
   574  func NewFloat32HashTable(cap uint64) *Float32HashTable {
   575  	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
   576  	ret := &Float32HashTable{cap: initCap, capMask: initCap - 1, size: 0}
   577  	ret.entries = make([]entryFloat32, initCap)
   578  	return ret
   579  }
   580  
   581  // Reset drops all of the values in this hash table and re-initializes it
   582  // with the specified initial capacity as if by calling New, but without having
   583  // to reallocate the object.
   584  func (h *Float32HashTable) Reset(cap uint64) {
   585  	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
   586  	h.capMask = h.cap - 1
   587  	h.size = 0
   588  	h.entries = make([]entryFloat32, h.cap)
   589  }
   590  
   591  // CopyValues is used for copying the values out of the hash table into the
   592  // passed in slice, in the order that they were first inserted
   593  func (h *Float32HashTable) CopyValues(out []float32) {
   594  	h.CopyValuesSubset(0, out)
   595  }
   596  
   597  // CopyValuesSubset copies a subset of the values in the hashtable out, starting
   598  // with the value at start, in the order that they were inserted.
   599  func (h *Float32HashTable) CopyValuesSubset(start int, out []float32) {
   600  	h.VisitEntries(func(e *entryFloat32) {
   601  		idx := e.payload.memoIdx - int32(start)
   602  		if idx >= 0 {
   603  			out[idx] = e.payload.val
   604  		}
   605  	})
   606  }
   607  
   608  func (h *Float32HashTable) WriteOut(out []byte) {
   609  	h.WriteOutSubset(0, out)
   610  }
   611  
   612  func (h *Float32HashTable) WriteOutSubset(start int, out []byte) {
   613  	data := arrow.Float32Traits.CastFromBytes(out)
   614  	h.VisitEntries(func(e *entryFloat32) {
   615  		idx := e.payload.memoIdx - int32(start)
   616  		if idx >= 0 {
   617  			data[idx] = utils.ToLEFloat32(e.payload.val)
   618  		}
   619  	})
   620  }
   621  
   622  func (h *Float32HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
   623  
   624  func (Float32HashTable) fixHash(v uint64) uint64 {
   625  	if v == sentinel {
   626  		return 42
   627  	}
   628  	return v
   629  }
   630  
   631  // Lookup retrieves the entry for a given hash value assuming it's payload value returns
   632  // true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
   633  // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
   634  func (h *Float32HashTable) Lookup(v uint64, cmp func(float32) bool) (*entryFloat32, bool) {
   635  	idx, ok := h.lookup(v, h.capMask, cmp)
   636  	return &h.entries[idx], ok
   637  }
   638  
   639  func (h *Float32HashTable) lookup(v uint64, szMask uint64, cmp func(float32) bool) (uint64, bool) {
   640  	const perturbShift uint8 = 5
   641  
   642  	var (
   643  		idx     uint64
   644  		perturb uint64
   645  		e       *entryFloat32
   646  	)
   647  
   648  	v = h.fixHash(v)
   649  	idx = v & szMask
   650  	perturb = (v >> uint64(perturbShift)) + 1
   651  
   652  	for {
   653  		e = &h.entries[idx]
   654  		if e.h == v && cmp(e.payload.val) {
   655  			return idx, true
   656  		}
   657  
   658  		if e.h == sentinel {
   659  			return idx, false
   660  		}
   661  
   662  		// perturbation logic inspired from CPython's set/dict object
   663  		// the goal is that all 64 bits of unmasked hash value eventually
   664  		// participate int he probing sequence, to minimize clustering
   665  		idx = (idx + perturb) & szMask
   666  		perturb = (perturb >> uint64(perturbShift)) + 1
   667  	}
   668  }
   669  
   670  func (h *Float32HashTable) upsize(newcap uint64) error {
   671  	newMask := newcap - 1
   672  
   673  	oldEntries := h.entries
   674  	h.entries = make([]entryFloat32, newcap)
   675  	for _, e := range oldEntries {
   676  		if e.Valid() {
   677  			idx, _ := h.lookup(e.h, newMask, func(float32) bool { return false })
   678  			h.entries[idx] = e
   679  		}
   680  	}
   681  	h.cap = newcap
   682  	h.capMask = newMask
   683  	return nil
   684  }
   685  
   686  // Insert updates the given entry with the provided hash value, payload value and memo index.
   687  // The entry pointer must have been retrieved via lookup in order to actually insert properly.
   688  func (h *Float32HashTable) Insert(e *entryFloat32, v uint64, val float32, memoIdx int32) error {
   689  	e.h = h.fixHash(v)
   690  	e.payload.val = val
   691  	e.payload.memoIdx = memoIdx
   692  	h.size++
   693  
   694  	if h.needUpsize() {
   695  		h.upsize(h.cap * uint64(loadFactor) * 2)
   696  	}
   697  	return nil
   698  }
   699  
   700  // VisitEntries will call the passed in function on each *valid* entry in the hash table,
   701  // a valid entry being one which has had a value inserted into it.
   702  func (h *Float32HashTable) VisitEntries(visit func(*entryFloat32)) {
   703  	for _, e := range h.entries {
   704  		if e.Valid() {
   705  			visit(&e)
   706  		}
   707  	}
   708  }
   709  
// Float32MemoTable is a wrapper over the appropriate hashtable to provide an interface
// conforming to the MemoTable interface defined in the encoding package for general interactions
// regarding dictionaries.
type Float32MemoTable struct {
	tbl     *Float32HashTable // backing hash table for the non-null values
	nullIdx int32             // memo index assigned to null, or KeyNotFound when no null has been inserted
}
   717  
   718  // NewFloat32MemoTable returns a new memotable with num entries pre-allocated to reduce further
   719  // allocations when inserting.
   720  func NewFloat32MemoTable(num int64) *Float32MemoTable {
   721  	return &Float32MemoTable{tbl: NewFloat32HashTable(uint64(num)), nullIdx: KeyNotFound}
   722  }
   723  
   724  // Reset allows this table to be re-used by dumping all the data currently in the table.
   725  func (s *Float32MemoTable) Reset() {
   726  	s.tbl.Reset(32)
   727  	s.nullIdx = KeyNotFound
   728  }
   729  
   730  // Size returns the current number of inserted elements into the table including if a null
   731  // has been inserted.
   732  func (s *Float32MemoTable) Size() int {
   733  	sz := int(s.tbl.size)
   734  	if _, ok := s.GetNull(); ok {
   735  		sz++
   736  	}
   737  	return sz
   738  }
   739  
   740  // GetNull returns the index of an inserted null or KeyNotFound along with a bool
   741  // that will be true if found and false if not.
   742  func (s *Float32MemoTable) GetNull() (int, bool) {
   743  	return int(s.nullIdx), s.nullIdx != KeyNotFound
   744  }
   745  
   746  // GetOrInsertNull will return the index of the null entry or insert a null entry
   747  // if one currently doesn't exist. The found value will be true if there was already
   748  // a null in the table, and false if it inserted one.
   749  func (s *Float32MemoTable) GetOrInsertNull() (idx int, found bool) {
   750  	idx, found = s.GetNull()
   751  	if !found {
   752  		idx = s.Size()
   753  		s.nullIdx = int32(idx)
   754  	}
   755  	return
   756  }
   757  
   758  // CopyValues will copy the values from the memo table out into the passed in slice
   759  // which must be of the appropriate type.
   760  func (s *Float32MemoTable) CopyValues(out interface{}) {
   761  	s.CopyValuesSubset(0, out)
   762  }
   763  
   764  // CopyValuesSubset is like CopyValues but only copies a subset of values starting
   765  // at the provided start index
   766  func (s *Float32MemoTable) CopyValuesSubset(start int, out interface{}) {
   767  	s.tbl.CopyValuesSubset(start, out.([]float32))
   768  }
   769  
   770  func (s *Float32MemoTable) WriteOut(out []byte) {
   771  	s.tbl.WriteOut(out)
   772  }
   773  
   774  func (s *Float32MemoTable) WriteOutSubset(start int, out []byte) {
   775  	s.tbl.WriteOutSubset(start, out)
   776  }
   777  
   778  // Get returns the index of the requested value in the hash table or KeyNotFound
   779  // along with a boolean indicating if it was found or not.
   780  func (s *Float32MemoTable) Get(val interface{}) (int, bool) {
   781  	var cmp func(float32) bool
   782  
   783  	if math.IsNaN(float64(val.(float32))) {
   784  		cmp = isNan32Cmp
   785  		// use consistent internal bit pattern for NaN regardless of the pattern
   786  		// that is passed to us. NaN is NaN is NaN
   787  		val = float32(math.NaN())
   788  	} else {
   789  		cmp = func(v float32) bool { return val.(float32) == v }
   790  	}
   791  
   792  	h := hashFloat32(val.(float32), 0)
   793  	if e, ok := s.tbl.Lookup(h, cmp); ok {
   794  		return int(e.payload.memoIdx), ok
   795  	}
   796  	return KeyNotFound, false
   797  }
   798  
   799  // GetOrInsert will return the index of the specified value in the table, or insert the
   800  // value into the table and return the new index. found indicates whether or not it already
   801  // existed in the table (true) or was inserted by this call (false).
   802  func (s *Float32MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   803  
   804  	var cmp func(float32) bool
   805  
   806  	if math.IsNaN(float64(val.(float32))) {
   807  		cmp = isNan32Cmp
   808  		// use consistent internal bit pattern for NaN regardless of the pattern
   809  		// that is passed to us. NaN is NaN is NaN
   810  		val = float32(math.NaN())
   811  	} else {
   812  		cmp = func(v float32) bool { return val.(float32) == v }
   813  	}
   814  
   815  	h := hashFloat32(val.(float32), 0)
   816  	e, ok := s.tbl.Lookup(h, cmp)
   817  
   818  	if ok {
   819  		idx = int(e.payload.memoIdx)
   820  		found = true
   821  	} else {
   822  		idx = s.Size()
   823  		s.tbl.Insert(e, h, val.(float32), int32(idx))
   824  	}
   825  	return
   826  }
   827  
// payloadFloat64 is the data carried by a hash table slot: the float64 value
// itself and the memo index that was assigned to it on insertion.
type payloadFloat64 struct {
	val     float64 // the stored value
	memoIdx int32   // index assigned to val; used as the output position when values are copied out
}
   832  
// entryFloat64 is a single slot of a Float64HashTable.
type entryFloat64 struct {
	h       uint64         // stored hash; equal to sentinel while the slot is unused (see Valid)
	payload payloadFloat64 // value and memo index held by this slot
}
   837  
   838  func (e entryFloat64) Valid() bool { return e.h != sentinel }
   839  
// Float64HashTable is a hashtable specifically for float64 that
// is utilized with the MemoTable to generalize interactions for easier
// implementation of dictionaries without losing performance.
type Float64HashTable struct {
	cap     uint64 // number of slots in entries (a power of two as allocated by NewFloat64HashTable and Reset)
	capMask uint64 // cap - 1 as set by NewFloat64HashTable and Reset
	size    uint64 // element count; zeroed by NewFloat64HashTable and Reset

	entries []entryFloat64 // slot storage
}
   850  
   851  // NewFloat64HashTable returns a new hash table for float64 values
   852  // initialized with the passed in capacity or 32 whichever is larger.
   853  func NewFloat64HashTable(cap uint64) *Float64HashTable {
   854  	initCap := uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
   855  	ret := &Float64HashTable{cap: initCap, capMask: initCap - 1, size: 0}
   856  	ret.entries = make([]entryFloat64, initCap)
   857  	return ret
   858  }
   859  
   860  // Reset drops all of the values in this hash table and re-initializes it
   861  // with the specified initial capacity as if by calling New, but without having
   862  // to reallocate the object.
   863  func (h *Float64HashTable) Reset(cap uint64) {
   864  	h.cap = uint64(bitutil.NextPowerOf2(int(max(cap, 32))))
   865  	h.capMask = h.cap - 1
   866  	h.size = 0
   867  	h.entries = make([]entryFloat64, h.cap)
   868  }
   869  
   870  // CopyValues is used for copying the values out of the hash table into the
   871  // passed in slice, in the order that they were first inserted
   872  func (h *Float64HashTable) CopyValues(out []float64) {
   873  	h.CopyValuesSubset(0, out)
   874  }
   875  
   876  // CopyValuesSubset copies a subset of the values in the hashtable out, starting
   877  // with the value at start, in the order that they were inserted.
   878  func (h *Float64HashTable) CopyValuesSubset(start int, out []float64) {
   879  	h.VisitEntries(func(e *entryFloat64) {
   880  		idx := e.payload.memoIdx - int32(start)
   881  		if idx >= 0 {
   882  			out[idx] = e.payload.val
   883  		}
   884  	})
   885  }
   886  
   887  func (h *Float64HashTable) WriteOut(out []byte) {
   888  	h.WriteOutSubset(0, out)
   889  }
   890  
   891  func (h *Float64HashTable) WriteOutSubset(start int, out []byte) {
   892  	data := arrow.Float64Traits.CastFromBytes(out)
   893  	h.VisitEntries(func(e *entryFloat64) {
   894  		idx := e.payload.memoIdx - int32(start)
   895  		if idx >= 0 {
   896  			data[idx] = utils.ToLEFloat64(e.payload.val)
   897  		}
   898  	})
   899  }
   900  
   901  func (h *Float64HashTable) needUpsize() bool { return h.size*uint64(loadFactor) >= h.cap }
   902  
   903  func (Float64HashTable) fixHash(v uint64) uint64 {
   904  	if v == sentinel {
   905  		return 42
   906  	}
   907  	return v
   908  }
   909  
   910  // Lookup retrieves the entry for a given hash value assuming it's payload value returns
   911  // true when passed to the cmp func. Returns a pointer to the entry for the given hash value,
   912  // and a boolean as to whether it was found. It is not safe to use the pointer if the bool is false.
   913  func (h *Float64HashTable) Lookup(v uint64, cmp func(float64) bool) (*entryFloat64, bool) {
   914  	idx, ok := h.lookup(v, h.capMask, cmp)
   915  	return &h.entries[idx], ok
   916  }
   917  
   918  func (h *Float64HashTable) lookup(v uint64, szMask uint64, cmp func(float64) bool) (uint64, bool) {
   919  	const perturbShift uint8 = 5
   920  
   921  	var (
   922  		idx     uint64
   923  		perturb uint64
   924  		e       *entryFloat64
   925  	)
   926  
   927  	v = h.fixHash(v)
   928  	idx = v & szMask
   929  	perturb = (v >> uint64(perturbShift)) + 1
   930  
   931  	for {
   932  		e = &h.entries[idx]
   933  		if e.h == v && cmp(e.payload.val) {
   934  			return idx, true
   935  		}
   936  
   937  		if e.h == sentinel {
   938  			return idx, false
   939  		}
   940  
   941  		// perturbation logic inspired from CPython's set/dict object
   942  		// the goal is that all 64 bits of unmasked hash value eventually
   943  		// participate int he probing sequence, to minimize clustering
   944  		idx = (idx + perturb) & szMask
   945  		perturb = (perturb >> uint64(perturbShift)) + 1
   946  	}
   947  }
   948  
   949  func (h *Float64HashTable) upsize(newcap uint64) error {
   950  	newMask := newcap - 1
   951  
   952  	oldEntries := h.entries
   953  	h.entries = make([]entryFloat64, newcap)
   954  	for _, e := range oldEntries {
   955  		if e.Valid() {
   956  			idx, _ := h.lookup(e.h, newMask, func(float64) bool { return false })
   957  			h.entries[idx] = e
   958  		}
   959  	}
   960  	h.cap = newcap
   961  	h.capMask = newMask
   962  	return nil
   963  }
   964  
   965  // Insert updates the given entry with the provided hash value, payload value and memo index.
   966  // The entry pointer must have been retrieved via lookup in order to actually insert properly.
   967  func (h *Float64HashTable) Insert(e *entryFloat64, v uint64, val float64, memoIdx int32) error {
   968  	e.h = h.fixHash(v)
   969  	e.payload.val = val
   970  	e.payload.memoIdx = memoIdx
   971  	h.size++
   972  
   973  	if h.needUpsize() {
   974  		h.upsize(h.cap * uint64(loadFactor) * 2)
   975  	}
   976  	return nil
   977  }
   978  
   979  // VisitEntries will call the passed in function on each *valid* entry in the hash table,
   980  // a valid entry being one which has had a value inserted into it.
   981  func (h *Float64HashTable) VisitEntries(visit func(*entryFloat64)) {
   982  	for _, e := range h.entries {
   983  		if e.Valid() {
   984  			visit(&e)
   985  		}
   986  	}
   987  }
   988  
   989  // Float64MemoTable is a wrapper over the appropriate hashtable to provide an interface
   990  // conforming to the MemoTable interface defined in the encoding package for general interactions
   991  // regarding dictionaries.
   992  type Float64MemoTable struct {
   993  	tbl     *Float64HashTable
   994  	nullIdx int32
   995  }
   996  
   997  // NewFloat64MemoTable returns a new memotable with num entries pre-allocated to reduce further
   998  // allocations when inserting.
   999  func NewFloat64MemoTable(num int64) *Float64MemoTable {
  1000  	return &Float64MemoTable{tbl: NewFloat64HashTable(uint64(num)), nullIdx: KeyNotFound}
  1001  }
  1002  
  1003  // Reset allows this table to be re-used by dumping all the data currently in the table.
  1004  func (s *Float64MemoTable) Reset() {
  1005  	s.tbl.Reset(32)
  1006  	s.nullIdx = KeyNotFound
  1007  }
  1008  
  1009  // Size returns the current number of inserted elements into the table including if a null
  1010  // has been inserted.
  1011  func (s *Float64MemoTable) Size() int {
  1012  	sz := int(s.tbl.size)
  1013  	if _, ok := s.GetNull(); ok {
  1014  		sz++
  1015  	}
  1016  	return sz
  1017  }
  1018  
  1019  // GetNull returns the index of an inserted null or KeyNotFound along with a bool
  1020  // that will be true if found and false if not.
  1021  func (s *Float64MemoTable) GetNull() (int, bool) {
  1022  	return int(s.nullIdx), s.nullIdx != KeyNotFound
  1023  }
  1024  
  1025  // GetOrInsertNull will return the index of the null entry or insert a null entry
  1026  // if one currently doesn't exist. The found value will be true if there was already
  1027  // a null in the table, and false if it inserted one.
  1028  func (s *Float64MemoTable) GetOrInsertNull() (idx int, found bool) {
  1029  	idx, found = s.GetNull()
  1030  	if !found {
  1031  		idx = s.Size()
  1032  		s.nullIdx = int32(idx)
  1033  	}
  1034  	return
  1035  }
  1036  
  1037  // CopyValues will copy the values from the memo table out into the passed in slice
  1038  // which must be of the appropriate type.
  1039  func (s *Float64MemoTable) CopyValues(out interface{}) {
  1040  	s.CopyValuesSubset(0, out)
  1041  }
  1042  
  1043  // CopyValuesSubset is like CopyValues but only copies a subset of values starting
  1044  // at the provided start index
  1045  func (s *Float64MemoTable) CopyValuesSubset(start int, out interface{}) {
  1046  	s.tbl.CopyValuesSubset(start, out.([]float64))
  1047  }
  1048  
  1049  func (s *Float64MemoTable) WriteOut(out []byte) {
  1050  	s.tbl.WriteOut(out)
  1051  }
  1052  
  1053  func (s *Float64MemoTable) WriteOutSubset(start int, out []byte) {
  1054  	s.tbl.WriteOutSubset(start, out)
  1055  }
  1056  
  1057  // Get returns the index of the requested value in the hash table or KeyNotFound
  1058  // along with a boolean indicating if it was found or not.
  1059  func (s *Float64MemoTable) Get(val interface{}) (int, bool) {
  1060  	var cmp func(float64) bool
  1061  	if math.IsNaN(val.(float64)) {
  1062  		cmp = math.IsNaN
  1063  		// use consistent internal bit pattern for NaN regardless of the pattern
  1064  		// that is passed to us. NaN is NaN is NaN
  1065  		val = math.NaN()
  1066  	} else {
  1067  		cmp = func(v float64) bool { return val.(float64) == v }
  1068  	}
  1069  
  1070  	h := hashFloat64(val.(float64), 0)
  1071  	if e, ok := s.tbl.Lookup(h, cmp); ok {
  1072  		return int(e.payload.memoIdx), ok
  1073  	}
  1074  	return KeyNotFound, false
  1075  }
  1076  
  1077  // GetOrInsert will return the index of the specified value in the table, or insert the
  1078  // value into the table and return the new index. found indicates whether or not it already
  1079  // existed in the table (true) or was inserted by this call (false).
  1080  func (s *Float64MemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
  1081  
  1082  	var cmp func(float64) bool
  1083  	if math.IsNaN(val.(float64)) {
  1084  		cmp = math.IsNaN
  1085  		// use consistent internal bit pattern for NaN regardless of the pattern
  1086  		// that is passed to us. NaN is NaN is NaN
  1087  		val = math.NaN()
  1088  	} else {
  1089  		cmp = func(v float64) bool { return val.(float64) == v }
  1090  	}
  1091  
  1092  	h := hashFloat64(val.(float64), 0)
  1093  	e, ok := s.tbl.Lookup(h, cmp)
  1094  
  1095  	if ok {
  1096  		idx = int(e.payload.memoIdx)
  1097  		found = true
  1098  	} else {
  1099  		idx = s.Size()
  1100  		s.tbl.Insert(e, h, val.(float64), int32(idx))
  1101  	}
  1102  	return
  1103  }