github.com/apache/arrow/go/v10@v10.0.1/internal/hashing/xxh3_memo_table.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  // Package hashing provides utilities for and an implementation of a hash
    18  // table which is more performant than the default go map implementation
    19  // by leveraging xxh3 and some custom hash functions.
    20  package hashing
    21  
    22  import (
    23  	"bytes"
    24  	"math"
    25  	"math/bits"
    26  	"reflect"
    27  	"unsafe"
    28  
    29  	"github.com/apache/arrow/go/v10/parquet"
    30  
    31  	"github.com/zeebo/xxh3"
    32  )
    33  
    34  //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl
    35  
// TypeTraits is returned by MemoTable.TypeTraits and exposes size
// information about the values a table stores.
type TypeTraits interface {
	// BytesRequired returns a byte count for n values.
	// NOTE(review): presumably the number of bytes needed to hold n values of
	// the table's element type — confirm against the implementations.
	BytesRequired(n int) int
}
    39  
// MemoTable interface for hash tables and dictionary encoding.
//
// Values will remember the order they are inserted to generate a valid
// dictionary.
type MemoTable interface {
	// TypeTraits returns the TypeTraits associated with this table.
	TypeTraits() TypeTraits
	// Reset drops everything in the table allowing it to be reused
	Reset()
	// Size returns the current number of unique values stored in
	// the table, including whether or not a null value has been
	// inserted via GetOrInsertNull.
	Size() int
	// GetOrInsert returns the index of the table the specified value is,
	// and a boolean indicating whether or not the value was found in
	// the table (if false, the value was inserted). An error is returned
	// if val is not the appropriate type for the table.
	GetOrInsert(val interface{}) (idx int, existed bool, err error)
	// GetOrInsertNull returns the index of the null value in the table,
	// inserting one if it hasn't already been inserted. It returns a boolean
	// indicating if the null value already existed or not in the table.
	GetOrInsertNull() (idx int, existed bool)
	// GetNull returns the index of the null value in the table, but does not
	// insert one if it doesn't already exist. When no null value exists the
	// returned index is -1 and the boolean is false.
	GetNull() (idx int, exists bool)
	// WriteOut copies the unique values of the memotable out to the byte slice
	// provided. Must have allocated enough bytes for all the values.
	WriteOut(out []byte)
	// WriteOutSubset is like WriteOut, but only writes a subset of values
	// starting with the index offset.
	WriteOutSubset(offset int, out []byte)
}
    72  
// NumericMemoTable is a MemoTable that additionally offers the WriteOutLE and
// WriteOutSubsetLE variants of WriteOut and WriteOutSubset.
// NOTE(review): the LE suffix presumably means the values are written in
// little-endian byte order — confirm against the implementations.
type NumericMemoTable interface {
	MemoTable
	WriteOutLE(out []byte)
	WriteOutSubsetLE(offset int, out []byte)
}
    78  
    79  func hashInt(val uint64, alg uint64) uint64 {
    80  	// Two of xxhash's prime multipliers (which are chosen for their
    81  	// bit dispersion properties)
    82  	var multipliers = [2]uint64{11400714785074694791, 14029467366897019727}
    83  	// Multiplying by the prime number mixes the low bits into the high bits,
    84  	// then byte-swapping (which is a single CPU instruction) allows the
    85  	// combined high and low bits to participate in the initial hash table index.
    86  	return bits.ReverseBytes64(multipliers[alg] * val)
    87  }
    88  
    89  func hashFloat32(val float32, alg uint64) uint64 {
    90  	// grab the raw byte pattern of the
    91  	bt := *(*[4]byte)(unsafe.Pointer(&val))
    92  	x := uint64(*(*uint32)(unsafe.Pointer(&bt[0])))
    93  	hx := hashInt(x, alg)
    94  	hy := hashInt(x, alg^1)
    95  	return 4 ^ hx ^ hy
    96  }
    97  
    98  func hashFloat64(val float64, alg uint64) uint64 {
    99  	bt := *(*[8]byte)(unsafe.Pointer(&val))
   100  	hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg)
   101  	hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1)
   102  	return 8 ^ hx ^ hy
   103  }
   104  
   105  func hashString(val string, alg uint64) uint64 {
   106  	buf := *(*[]byte)(unsafe.Pointer(&val))
   107  	(*reflect.SliceHeader)(unsafe.Pointer(&buf)).Cap = len(val)
   108  	return hash(buf, alg)
   109  }
   110  
// prime constants used for slightly increasing the hash quality further.
// hash adds exprimes[alg] to the xxh3 result for inputs longer than 16 bytes,
// so the two alg variants yield different values for the same input.
var exprimes = [2]uint64{1609587929392839161, 9650029242287828579}
   113  
   114  // for smaller amounts of bytes this is faster than even calling into
   115  // xxh3 to do the hash, so we specialize in order to get the benefits
   116  // of that performance.
   117  func hash(b []byte, alg uint64) uint64 {
   118  	n := uint32(len(b))
   119  	if n <= 16 {
   120  		switch {
   121  		case n > 8:
   122  			// 8 < length <= 16
   123  			// apply same principle as above, but as two 64-bit ints
   124  			x := *(*uint64)(unsafe.Pointer(&b[n-8]))
   125  			y := *(*uint64)(unsafe.Pointer(&b[0]))
   126  			hx := hashInt(x, alg)
   127  			hy := hashInt(y, alg^1)
   128  			return uint64(n) ^ hx ^ hy
   129  		case n >= 4:
   130  			// 4 < length <= 8
   131  			// we can read the bytes as two overlapping 32-bit ints, apply different
   132  			// hash functions to each in parallel
   133  			// then xor the results
   134  			x := *(*uint32)(unsafe.Pointer(&b[n-4]))
   135  			y := *(*uint32)(unsafe.Pointer(&b[0]))
   136  			hx := hashInt(uint64(x), alg)
   137  			hy := hashInt(uint64(y), alg^1)
   138  			return uint64(n) ^ hx ^ hy
   139  		case n > 0:
   140  			x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1]))
   141  			return hashInt(uint64(x), alg)
   142  		case n == 0:
   143  			return 1
   144  		}
   145  	}
   146  
   147  	// increase differentiation enough to improve hash quality
   148  	return xxh3.Hash(b) + exprimes[alg]
   149  }
   150  
// Constants for the hash table implementations; neither is referenced in the
// portion of the package shown in this file.
const (
	// NOTE(review): presumably the hash value that marks an unused slot —
	// confirm against the hash table implementation.
	sentinel uint64 = 0
	// NOTE(review): presumably the ratio of allocated slots to stored
	// entries — confirm against the hash table implementation.
	loadFactor int64 = 2
)
   155  
   156  func max(a, b uint64) uint64 {
   157  	if a > b {
   158  		return a
   159  	}
   160  	return b
   161  }
   162  
// isNan32Cmp reports whether v is NaN, widening it to float64 so that
// math.IsNaN can be used.
var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) }

// KeyNotFound is the constant returned by memo table functions when a key isn't found in the table
const KeyNotFound = -1
   167  
// BinaryBuilderIFace is the set of builder operations BinaryMemoTable uses to
// store the values inserted into it.
// NOTE(review): the per-method notes below describe how BinaryMemoTable uses
// each method; the exact semantics belong to the concrete builder — confirm
// there.
type BinaryBuilderIFace interface {
	// Reserve is called with the number of entries to make room for.
	Reserve(int)
	// ReserveData is called with the number of value bytes to make room for.
	ReserveData(int)
	// Retain is forwarded from BinaryMemoTable.Retain.
	Retain()
	// Resize is called with 0 when the table is reset.
	Resize(int)
	// ResizeData is called with 0 when the table is reset.
	ResizeData(int)
	// Release is forwarded from BinaryMemoTable.Release.
	Release()
	// DataLen is used as the total number of value bytes stored so far.
	DataLen() int
	// Value is used to fetch the bytes of the entry at the given index.
	Value(int) []byte
	// Len is used as the number of entries stored so far.
	Len() int
	// AppendNull is called when a null is inserted into the table.
	AppendNull()
	// AppendString is called to store a string value.
	AppendString(string)
	// Append is called to store a byte-slice value.
	Append([]byte)
}
   182  
// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder
// to construct the actual data in an easy to pass around way with minimal copies
// while using a hash table to keep track of the indexes into the dictionary that
// is created as we go.
type BinaryMemoTable struct {
	// tbl maps value hashes to the index of the value inside builder.
	tbl *Int32HashTable
	// builder holds the inserted values in insertion order.
	builder BinaryBuilderIFace
	// nullIdx is the index at which a null was inserted, or KeyNotFound if
	// no null has been inserted.
	nullIdx int
}
   192  
   193  // NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will
   194  // be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used.
   195  // initial and valuesize can be used to pre-allocate the table to reduce allocations. With
   196  // initial being the initial number of entries to allocate for and valuesize being the starting
   197  // amount of space allocated for writing the actual binary data.
   198  func NewBinaryMemoTable(initial, valuesize int, bldr BinaryBuilderIFace) *BinaryMemoTable {
   199  	bldr.Reserve(int(initial))
   200  	datasize := valuesize
   201  	if datasize <= 0 {
   202  		datasize = initial * 4
   203  	}
   204  	bldr.ReserveData(datasize)
   205  	return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound}
   206  }
   207  
   208  type unimplementedtraits struct{}
   209  
   210  func (unimplementedtraits) BytesRequired(int) int { panic("unimplemented") }
   211  
   212  func (BinaryMemoTable) TypeTraits() TypeTraits {
   213  	return unimplementedtraits{}
   214  }
   215  
   216  // Reset dumps all of the data in the table allowing it to be reutilized.
   217  func (s *BinaryMemoTable) Reset() {
   218  	s.tbl.Reset(32)
   219  	s.builder.Resize(0)
   220  	s.builder.ResizeData(0)
   221  	s.builder.Reserve(int(32))
   222  	s.builder.ReserveData(int(32) * 4)
   223  	s.nullIdx = KeyNotFound
   224  }
   225  
   226  // GetNull returns the index of a null that has been inserted into the table or
   227  // KeyNotFound. The bool returned will be true if there was a null inserted into
   228  // the table, and false otherwise.
   229  func (s *BinaryMemoTable) GetNull() (int, bool) {
   230  	return int(s.nullIdx), s.nullIdx != KeyNotFound
   231  }
   232  
   233  // Size returns the current size of the memo table including the null value
   234  // if one has been inserted.
   235  func (s *BinaryMemoTable) Size() int {
   236  	sz := int(s.tbl.size)
   237  	if _, ok := s.GetNull(); ok {
   238  		sz++
   239  	}
   240  	return sz
   241  }
   242  
// helper function to easily return a byte slice for any given value
// regardless of the type if it's a []byte, parquet.ByteArray,
// parquet.FixedLenByteArray or string.
//
// No bytes are copied: the returned slice aliases the storage of val, so it
// must be treated as read-only (in particular when val is a string). Any
// other type panics.
func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte {
	switch v := val.(type) {
	case []byte:
		return v
	case parquet.ByteArray:
		// reinterpret the value's header as a plain []byte header
		return *(*[]byte)(unsafe.Pointer(&v))
	case parquet.FixedLenByteArray:
		// reinterpret the value's header as a plain []byte header
		return *(*[]byte)(unsafe.Pointer(&v))
	case string:
		// build a slice header that points at the string's bytes, with the
		// capacity clamped to the string length
		var out []byte
		h := (*reflect.StringHeader)(unsafe.Pointer(&v))
		s := (*reflect.SliceHeader)(unsafe.Pointer(&out))
		s.Data = h.Data
		s.Len = h.Len
		s.Cap = h.Len
		return out
	default:
		panic("invalid type for binarymemotable")
	}
}
   266  
   267  // helper function to get the hash value regardless of the underlying binary type
   268  func (BinaryMemoTable) getHash(val interface{}) uint64 {
   269  	switch v := val.(type) {
   270  	case string:
   271  		return hashString(v, 0)
   272  	case []byte:
   273  		return hash(v, 0)
   274  	case parquet.ByteArray:
   275  		return hash(*(*[]byte)(unsafe.Pointer(&v)), 0)
   276  	case parquet.FixedLenByteArray:
   277  		return hash(*(*[]byte)(unsafe.Pointer(&v)), 0)
   278  	default:
   279  		panic("invalid type for binarymemotable")
   280  	}
   281  }
   282  
// helper function to append the given value to the builder regardless
// of the underlying binary type.
//
// Strings are appended with AppendString and the byte-slice types with
// Append. A value of any other type is silently ignored: the switch has no
// default case.
func (b *BinaryMemoTable) appendVal(val interface{}) {
	switch v := val.(type) {
	case string:
		b.builder.AppendString(v)
	case []byte:
		b.builder.Append(v)
	case parquet.ByteArray:
		// reinterpret the value's header as a plain []byte header
		b.builder.Append(*(*[]byte)(unsafe.Pointer(&v)))
	case parquet.FixedLenByteArray:
		// reinterpret the value's header as a plain []byte header
		b.builder.Append(*(*[]byte)(unsafe.Pointer(&v)))
	}
}
   297  
   298  func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) {
   299  	return b.tbl.Lookup(h, func(i int32) bool {
   300  		return bytes.Equal(val, b.builder.Value(int(i)))
   301  	})
   302  }
   303  
   304  // Get returns the index of the specified value in the table or KeyNotFound,
   305  // and a boolean indicating whether it was found in the table.
   306  func (b *BinaryMemoTable) Get(val interface{}) (int, bool) {
   307  	if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok {
   308  		return int(p.payload.val), ok
   309  	}
   310  	return KeyNotFound, false
   311  }
   312  
   313  // GetOrInsert returns the index of the given value in the table, if not found
   314  // it is inserted into the table. The return value 'found' indicates whether the value
   315  // was found in the table (true) or inserted (false) along with any possible error.
   316  func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   317  	h := b.getHash(val)
   318  	p, found := b.lookup(h, b.valAsByteSlice(val))
   319  	if found {
   320  		idx = int(p.payload.val)
   321  	} else {
   322  		idx = b.Size()
   323  		b.appendVal(val)
   324  		b.tbl.Insert(p, h, int32(idx), -1)
   325  	}
   326  	return
   327  }
   328  
   329  // GetOrInsertNull retrieves the index of a null in the table or inserts
   330  // null into the table, returning the index and a boolean indicating if it was
   331  // found in the table (true) or was inserted (false).
   332  func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) {
   333  	idx, found = b.GetNull()
   334  	if !found {
   335  		idx = b.Size()
   336  		b.nullIdx = idx
   337  		b.builder.AppendNull()
   338  	}
   339  	return
   340  }
   341  
   342  // helper function to get the offset into the builder data for a given
   343  // index value.
   344  func (b *BinaryMemoTable) findOffset(idx int) uintptr {
   345  	val := b.builder.Value(idx)
   346  	for len(val) == 0 {
   347  		idx++
   348  		if idx >= b.builder.Len() {
   349  			break
   350  		}
   351  		val = b.builder.Value(idx)
   352  	}
   353  	if len(val) != 0 {
   354  		return uintptr(unsafe.Pointer(&val[0]))
   355  	}
   356  	return uintptr(b.builder.DataLen()) + b.findOffset(0)
   357  }
   358  
// CopyOffsets copies the list of offsets into the passed in slice, the offsets
// being the start and end values of the underlying allocated bytes in the builder
// for the individual values of the table. out should be at least sized to Size()+1
//
// It is equivalent to CopyOffsetsSubset(0, out).
func (b *BinaryMemoTable) CopyOffsets(out []int32) {
	b.CopyOffsetsSubset(0, out)
}
   365  
   366  // CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets,
   367  // it gets a subset of the offsets in the table starting at the index provided by "start".
   368  func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int32) {
   369  	if b.builder.Len() <= start {
   370  		return
   371  	}
   372  
   373  	first := b.findOffset(0)
   374  	delta := b.findOffset(start)
   375  	sz := b.Size()
   376  	for i := start; i < sz; i++ {
   377  		offset := int32(b.findOffset(i) - delta)
   378  		out[i-start] = offset
   379  	}
   380  
   381  	out[sz-start] = int32(b.builder.DataLen() - (int(delta) - int(first)))
   382  }
   383  
// CopyValues copies the raw binary data bytes out, out should be a []byte
// with at least ValuesSize bytes allocated to copy into.
//
// It is equivalent to CopyValuesSubset(0, out).
func (b *BinaryMemoTable) CopyValues(out interface{}) {
	b.CopyValuesSubset(0, out)
}
   389  
   390  // CopyValuesSubset copies the raw binary data bytes out starting with the value
   391  // at the index start, out should be a []byte with at least ValuesSize bytes allocated
   392  func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) {
   393  	if b.builder.Len() <= start {
   394  		return
   395  	}
   396  
   397  	var (
   398  		first  = b.findOffset(0)
   399  		offset = b.findOffset(int(start))
   400  		length = b.builder.DataLen() - int(offset-first)
   401  	)
   402  
   403  	outval := out.([]byte)
   404  	copy(outval, b.builder.Value(start)[0:length])
   405  }
   406  
// WriteOut copies the raw bytes of all values in the table into out by
// delegating to CopyValues.
func (b *BinaryMemoTable) WriteOut(out []byte) {
	b.CopyValues(out)
}
   410  
// WriteOutSubset copies the raw bytes of the values from index start onwards
// into out by delegating to CopyValuesSubset.
func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) {
	b.CopyValuesSubset(start, out)
}
   414  
   415  // CopyFixedWidthValues exists to cope with the fact that the table doesn't keep
   416  // track of the fixed width when inserting the null value the databuffer holds a
   417  // zero length byte slice for the null value (if found)
   418  func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) {
   419  	if start >= b.Size() {
   420  		return
   421  	}
   422  
   423  	null, exists := b.GetNull()
   424  	if !exists || null < start {
   425  		// nothing to skip, proceed as usual
   426  		b.CopyValuesSubset(start, out)
   427  		return
   428  	}
   429  
   430  	var (
   431  		leftOffset = b.findOffset(start)
   432  		nullOffset = b.findOffset(null)
   433  		leftSize   = nullOffset - leftOffset
   434  	)
   435  
   436  	if leftSize > 0 {
   437  		copy(out, b.builder.Value(start)[0:leftSize])
   438  	}
   439  
   440  	rightSize := b.ValuesSize() - int(nullOffset)
   441  	if rightSize > 0 {
   442  		// skip the null fixed size value
   443  		copy(out[int(leftSize)+width:], b.builder.Value(int(nullOffset))[0:rightSize])
   444  	}
   445  }
   446  
   447  // VisitValues exists to run the visitFn on each value currently in the hash table.
   448  func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) {
   449  	for i := int(start); i < b.Size(); i++ {
   450  		visitFn(b.builder.Value(i))
   451  	}
   452  }
   453  
   454  // Release is used to tell the underlying builder that it can release the memory allocated
   455  // when the reference count reaches 0, this is safe to be called from multiple goroutines
   456  // simultaneously
   457  func (b *BinaryMemoTable) Release() { b.builder.Release() }
   458  
   459  // Retain increases the ref count, it is safe to call it from multiple goroutines
   460  // simultaneously.
   461  func (b *BinaryMemoTable) Retain() { b.builder.Retain() }
   462  
   463  // ValuesSize returns the current total size of all the raw bytes that have been inserted
   464  // into the memotable so far.
   465  func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() }