github.com/apache/arrow/go/v16@v16.1.0/internal/hashing/xxh3_memo_table.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  // Package hashing provides utilities for and an implementation of a hash
    18  // table which is more performant than the default go map implementation
    19  // by leveraging xxh3 and some custom hash functions.
    20  package hashing
    21  
import (
	"bytes"
	"fmt"
	"math"
	"unsafe"
)
    27  
    28  //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl
    29  
// TypeTraits is the set of per-type helpers that a MemoTable exposes through
// its TypeTraits method.
type TypeTraits interface {
	// BytesRequired reports a byte count for n values.
	// NOTE(review): presumably the buffer size needed to hold n values of the
	// table's element type; confirm against the concrete implementations.
	BytesRequired(n int) int
}
    33  
// ByteSlice is implemented by any value that can expose its contents as a
// []byte. BinaryMemoTable accepts such values in addition to []byte and
// string.
type ByteSlice interface {
	// Bytes returns the contents of the value as a byte slice.
	Bytes() []byte
}
    37  
// MemoTable is the interface for hash tables used for dictionary encoding.
//
// Values remember the order in which they were inserted, so that a valid
// dictionary can be generated from the table.
type MemoTable interface {
	// TypeTraits returns the TypeTraits associated with this table.
	TypeTraits() TypeTraits
	// Reset drops everything in the table, allowing it to be reused.
	Reset()
	// Size returns the current number of unique values stored in
	// the table, counting the null value if one has been inserted
	// via GetOrInsertNull.
	Size() int
	// GetOrInsert returns the index of the specified value in the table,
	// and a boolean indicating whether or not the value was already in
	// the table (if false, the value was inserted). An error is returned
	// if val is not the appropriate type for the table.
	GetOrInsert(val interface{}) (idx int, existed bool, err error)
	// GetOrInsertBytes returns the index of the specified value in the table,
	// and a boolean indicating whether or not the value was already in
	// the table (if false, the value was inserted). An error is returned
	// if val is not the appropriate type for the table. This function is
	// intended to be used by the BinaryMemoTable to prevent unnecessary
	// allocations of the data when converting from a []byte to interface{}.
	GetOrInsertBytes(val []byte) (idx int, existed bool, err error)
	// GetOrInsertNull returns the index of the null value in the table,
	// inserting one if it hasn't already been inserted. The boolean
	// indicates whether the null value already existed in the table.
	GetOrInsertNull() (idx int, existed bool)
	// GetNull returns the index of the null value in the table, but does not
	// insert one if it doesn't already exist. When no null has been inserted
	// the index is -1 and the boolean is false.
	GetNull() (idx int, exists bool)
	// WriteOut copies the unique values of the memotable out to the byte slice
	// provided. The caller must have allocated enough bytes for all the values.
	WriteOut(out []byte)
	// WriteOutSubset is like WriteOut, but only writes the subset of values
	// starting at the index offset.
	WriteOutSubset(offset int, out []byte)
}
    76  
// NumericMemoTable extends MemoTable with additional write-out methods for
// tables of numeric values.
type NumericMemoTable interface {
	MemoTable
	// WriteOutLE is a variant of WriteOut.
	// NOTE(review): the LE suffix presumably means the values are written in
	// little-endian byte order; confirm against the implementations.
	WriteOutLE(out []byte)
	// WriteOutSubsetLE is a variant of WriteOutSubset, starting at the index
	// offset. NOTE(review): presumably little-endian output; confirm.
	WriteOutSubsetLE(offset int, out []byte)
}
    82  
// Package-level tuning constants. Neither is referenced in this part of the
// file, so the notes below are assumptions to be confirmed against their users.
const (
	// NOTE(review): presumably the hash value reserved to mark an unused
	// slot in the hash tables; confirm.
	sentinel   uint64 = 0
	// NOTE(review): presumably the factor relating table capacity to the
	// number of stored entries when sizing or growing; confirm.
	loadFactor int64  = 2
)
    87  
    88  func max(a, b uint64) uint64 {
    89  	if a > b {
    90  		return a
    91  	}
    92  	return b
    93  }
    94  
    95  var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) }
    96  
// KeyNotFound is the index that memo table functions return when a key (or
// the null entry) is not present in the table.
const KeyNotFound = -1
    99  
// BinaryBuilderIFace is the set of builder operations BinaryMemoTable relies
// on to store its values. The per-method notes describe how BinaryMemoTable
// uses each method; the exact semantics are defined by the implementation.
type BinaryBuilderIFace interface {
	// Reserve is called with a number of entries to pre-allocate for.
	Reserve(int)
	// ReserveData is called with a number of value bytes to pre-allocate for.
	ReserveData(int)
	// Retain is forwarded from BinaryMemoTable.Retain.
	Retain()
	// Resize is called with 0 when the table is reset.
	Resize(int)
	// ResizeData is called with 0 when the table is reset.
	ResizeData(int)
	// Release is forwarded from BinaryMemoTable.Release.
	Release()
	// DataLen is used as the total number of value bytes stored so far.
	DataLen() int
	// Value is used to fetch the bytes of the value stored at an index.
	Value(int) []byte
	// Len is used as the number of entries stored so far.
	Len() int
	// AppendNull is called to record the null entry.
	AppendNull()
	// AppendString is called to store a string value.
	AppendString(string)
	// Append is called to store a []byte value.
	Append([]byte)
}
   114  
// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder
// to construct the actual data in an easy to pass around way with minimal copies
// while using a hash table to keep track of the indexes into the dictionary that
// is created as we go.
type BinaryMemoTable struct {
	// tbl maps value hashes to the int32 index of the value in builder.
	tbl     *Int32HashTable
	// builder holds the values themselves, in insertion order.
	builder BinaryBuilderIFace
	// nullIdx is the index of the null entry, or KeyNotFound if no null
	// has been inserted.
	nullIdx int
}
   124  
   125  // NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will
   126  // be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used.
   127  // initial and valuesize can be used to pre-allocate the table to reduce allocations. With
   128  // initial being the initial number of entries to allocate for and valuesize being the starting
   129  // amount of space allocated for writing the actual binary data.
   130  func NewBinaryMemoTable(initial, valuesize int, bldr BinaryBuilderIFace) *BinaryMemoTable {
   131  	bldr.Reserve(int(initial))
   132  	datasize := valuesize
   133  	if datasize <= 0 {
   134  		datasize = initial * 4
   135  	}
   136  	bldr.ReserveData(datasize)
   137  	return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound}
   138  }
   139  
   140  type unimplementedtraits struct{}
   141  
   142  func (unimplementedtraits) BytesRequired(int) int { panic("unimplemented") }
   143  
   144  func (BinaryMemoTable) TypeTraits() TypeTraits {
   145  	return unimplementedtraits{}
   146  }
   147  
   148  // Reset dumps all of the data in the table allowing it to be reutilized.
   149  func (s *BinaryMemoTable) Reset() {
   150  	s.tbl.Reset(32)
   151  	s.builder.Resize(0)
   152  	s.builder.ResizeData(0)
   153  	s.builder.Reserve(int(32))
   154  	s.builder.ReserveData(int(32) * 4)
   155  	s.nullIdx = KeyNotFound
   156  }
   157  
   158  // GetNull returns the index of a null that has been inserted into the table or
   159  // KeyNotFound. The bool returned will be true if there was a null inserted into
   160  // the table, and false otherwise.
   161  func (s *BinaryMemoTable) GetNull() (int, bool) {
   162  	return int(s.nullIdx), s.nullIdx != KeyNotFound
   163  }
   164  
   165  // Size returns the current size of the memo table including the null value
   166  // if one has been inserted.
   167  func (s *BinaryMemoTable) Size() int {
   168  	sz := int(s.tbl.size)
   169  	if _, ok := s.GetNull(); ok {
   170  		sz++
   171  	}
   172  	return sz
   173  }
   174  
   175  // helper function to easily return a byte slice for any given value
   176  // regardless of the type if it's a []byte, string, or fulfills the
   177  // ByteSlice interface.
   178  func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte {
   179  	switch v := val.(type) {
   180  	case []byte:
   181  		return v
   182  	case ByteSlice:
   183  		return v.Bytes()
   184  	case string:
   185  		return strToBytes(v)
   186  	default:
   187  		panic("invalid type for binarymemotable")
   188  	}
   189  }
   190  
   191  // helper function to get the hash value regardless of the underlying binary type
   192  func (BinaryMemoTable) getHash(val interface{}) uint64 {
   193  	switch v := val.(type) {
   194  	case string:
   195  		return hashString(v, 0)
   196  	case []byte:
   197  		return Hash(v, 0)
   198  	case ByteSlice:
   199  		return Hash(v.Bytes(), 0)
   200  	default:
   201  		panic("invalid type for binarymemotable")
   202  	}
   203  }
   204  
   205  // helper function to append the given value to the builder regardless
   206  // of the underlying binary type.
   207  func (b *BinaryMemoTable) appendVal(val interface{}) {
   208  	switch v := val.(type) {
   209  	case string:
   210  		b.builder.AppendString(v)
   211  	case []byte:
   212  		b.builder.Append(v)
   213  	case ByteSlice:
   214  		b.builder.Append(v.Bytes())
   215  	}
   216  }
   217  
   218  func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) {
   219  	return b.tbl.Lookup(h, func(i int32) bool {
   220  		return bytes.Equal(val, b.builder.Value(int(i)))
   221  	})
   222  }
   223  
   224  // Get returns the index of the specified value in the table or KeyNotFound,
   225  // and a boolean indicating whether it was found in the table.
   226  func (b *BinaryMemoTable) Get(val interface{}) (int, bool) {
   227  	if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok {
   228  		return int(p.payload.val), ok
   229  	}
   230  	return KeyNotFound, false
   231  }
   232  
   233  // GetOrInsertBytes returns the index of the given value in the table, if not found
   234  // it is inserted into the table. The return value 'found' indicates whether the value
   235  // was found in the table (true) or inserted (false) along with any possible error.
   236  func (b *BinaryMemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
   237  	h := Hash(val, 0)
   238  	p, found := b.lookup(h, val)
   239  	if found {
   240  		idx = int(p.payload.val)
   241  	} else {
   242  		idx = b.Size()
   243  		b.builder.Append(val)
   244  		b.tbl.Insert(p, h, int32(idx), -1)
   245  	}
   246  	return
   247  }
   248  
   249  // GetOrInsert returns the index of the given value in the table, if not found
   250  // it is inserted into the table. The return value 'found' indicates whether the value
   251  // was found in the table (true) or inserted (false) along with any possible error.
   252  func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   253  	h := b.getHash(val)
   254  	p, found := b.lookup(h, b.valAsByteSlice(val))
   255  	if found {
   256  		idx = int(p.payload.val)
   257  	} else {
   258  		idx = b.Size()
   259  		b.appendVal(val)
   260  		b.tbl.Insert(p, h, int32(idx), -1)
   261  	}
   262  	return
   263  }
   264  
   265  // GetOrInsertNull retrieves the index of a null in the table or inserts
   266  // null into the table, returning the index and a boolean indicating if it was
   267  // found in the table (true) or was inserted (false).
   268  func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) {
   269  	idx, found = b.GetNull()
   270  	if !found {
   271  		idx = b.Size()
   272  		b.nullIdx = idx
   273  		b.builder.AppendNull()
   274  	}
   275  	return
   276  }
   277  
   278  func (b *BinaryMemoTable) Value(i int) []byte {
   279  	return b.builder.Value(i)
   280  }
   281  
   282  // helper function to get the offset into the builder data for a given
   283  // index value.
   284  func (b *BinaryMemoTable) findOffset(idx int) uintptr {
   285  	if b.builder.DataLen() == 0 {
   286  		// only empty strings, short circuit
   287  		return 0
   288  	}
   289  
   290  	val := b.builder.Value(idx)
   291  	for len(val) == 0 {
   292  		idx++
   293  		if idx >= b.builder.Len() {
   294  			break
   295  		}
   296  		val = b.builder.Value(idx)
   297  	}
   298  	if len(val) != 0 {
   299  		return uintptr(unsafe.Pointer(&val[0]))
   300  	}
   301  	return uintptr(b.builder.DataLen()) + b.findOffset(0)
   302  }
   303  
   304  // CopyOffsets copies the list of offsets into the passed in slice, the offsets
   305  // being the start and end values of the underlying allocated bytes in the builder
   306  // for the individual values of the table. out should be at least sized to Size()+1
   307  func (b *BinaryMemoTable) CopyOffsets(out []int32) {
   308  	b.CopyOffsetsSubset(0, out)
   309  }
   310  
   311  // CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets,
   312  // it gets a subset of the offsets in the table starting at the index provided by "start".
   313  func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int32) {
   314  	if b.builder.Len() <= start {
   315  		return
   316  	}
   317  
   318  	first := b.findOffset(0)
   319  	delta := b.findOffset(start)
   320  	sz := b.Size()
   321  	for i := start; i < sz; i++ {
   322  		offset := int32(b.findOffset(i) - delta)
   323  		out[i-start] = offset
   324  	}
   325  
   326  	out[sz-start] = int32(b.builder.DataLen() - (int(delta) - int(first)))
   327  }
   328  
   329  // CopyLargeOffsets copies the list of offsets into the passed in slice, the offsets
   330  // being the start and end values of the underlying allocated bytes in the builder
   331  // for the individual values of the table. out should be at least sized to Size()+1
   332  func (b *BinaryMemoTable) CopyLargeOffsets(out []int64) {
   333  	b.CopyLargeOffsetsSubset(0, out)
   334  }
   335  
   336  // CopyLargeOffsetsSubset is like CopyOffsets but instead of copying all of the offsets,
   337  // it gets a subset of the offsets in the table starting at the index provided by "start".
   338  func (b *BinaryMemoTable) CopyLargeOffsetsSubset(start int, out []int64) {
   339  	if b.builder.Len() <= start {
   340  		return
   341  	}
   342  
   343  	first := b.findOffset(0)
   344  	delta := b.findOffset(start)
   345  	sz := b.Size()
   346  	for i := start; i < sz; i++ {
   347  		offset := int64(b.findOffset(i) - delta)
   348  		out[i-start] = offset
   349  	}
   350  
   351  	out[sz-start] = int64(b.builder.DataLen() - (int(delta) - int(first)))
   352  }
   353  
   354  // CopyValues copies the raw binary data bytes out, out should be a []byte
   355  // with at least ValuesSize bytes allocated to copy into.
   356  func (b *BinaryMemoTable) CopyValues(out interface{}) {
   357  	b.CopyValuesSubset(0, out)
   358  }
   359  
   360  // CopyValuesSubset copies the raw binary data bytes out starting with the value
   361  // at the index start, out should be a []byte with at least ValuesSize bytes allocated
   362  func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) {
   363  	if b.builder.Len() <= start {
   364  		return
   365  	}
   366  
   367  	var (
   368  		first  = b.findOffset(0)
   369  		offset = b.findOffset(int(start))
   370  		length = b.builder.DataLen() - int(offset-first)
   371  	)
   372  
   373  	outval := out.([]byte)
   374  	copy(outval, b.builder.Value(start)[0:length])
   375  }
   376  
   377  func (b *BinaryMemoTable) WriteOut(out []byte) {
   378  	b.CopyValues(out)
   379  }
   380  
   381  func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) {
   382  	b.CopyValuesSubset(start, out)
   383  }
   384  
   385  // CopyFixedWidthValues exists to cope with the fact that the table doesn't keep
   386  // track of the fixed width when inserting the null value the databuffer holds a
   387  // zero length byte slice for the null value (if found)
   388  func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) {
   389  	if start >= b.Size() {
   390  		return
   391  	}
   392  
   393  	null, exists := b.GetNull()
   394  	if !exists || null < start {
   395  		// nothing to skip, proceed as usual
   396  		b.CopyValuesSubset(start, out)
   397  		return
   398  	}
   399  
   400  	var (
   401  		leftOffset  = b.findOffset(start)
   402  		nullOffset  = b.findOffset(null)
   403  		leftSize    = nullOffset - leftOffset
   404  		rightOffset = leftOffset + uintptr(b.ValuesSize())
   405  	)
   406  
   407  	if leftSize > 0 {
   408  		copy(out, b.builder.Value(start)[0:leftSize])
   409  	}
   410  
   411  	rightSize := rightOffset - nullOffset
   412  	if rightSize > 0 {
   413  		// skip the null fixed size value
   414  		copy(out[int(leftSize)+width:], b.builder.Value(null + 1)[0:rightSize])
   415  	}
   416  }
   417  
   418  // VisitValues exists to run the visitFn on each value currently in the hash table.
   419  func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) {
   420  	for i := int(start); i < b.Size(); i++ {
   421  		visitFn(b.builder.Value(i))
   422  	}
   423  }
   424  
   425  // Release is used to tell the underlying builder that it can release the memory allocated
   426  // when the reference count reaches 0, this is safe to be called from multiple goroutines
   427  // simultaneously
   428  func (b *BinaryMemoTable) Release() { b.builder.Release() }
   429  
   430  // Retain increases the ref count, it is safe to call it from multiple goroutines
   431  // simultaneously.
   432  func (b *BinaryMemoTable) Retain() { b.builder.Retain() }
   433  
   434  // ValuesSize returns the current total size of all the raw bytes that have been inserted
   435  // into the memotable so far.
   436  func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() }