github.com/apache/arrow/go/v14@v14.0.2/internal/hashing/xxh3_memo_table.go (about)

     1  // Licensed to the Apache Software Foundation (ASF) under one
     2  // or more contributor license agreements.  See the NOTICE file
     3  // distributed with this work for additional information
     4  // regarding copyright ownership.  The ASF licenses this file
     5  // to you under the Apache License, Version 2.0 (the
     6  // "License"); you may not use this file except in compliance
     7  // with the License.  You may obtain a copy of the License at
     8  //
     9  // http://www.apache.org/licenses/LICENSE-2.0
    10  //
    11  // Unless required by applicable law or agreed to in writing, software
    12  // distributed under the License is distributed on an "AS IS" BASIS,
    13  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14  // See the License for the specific language governing permissions and
    15  // limitations under the License.
    16  
    17  // Package hashing provides utilities for and an implementation of a hash
    18  // table which is more performant than the default go map implementation
    19  // by leveraging xxh3 and some custom hash functions.
    20  package hashing
    21  
    22  import (
    23  	"bytes"
    24  	"math"
    25  	"reflect"
    26  	"unsafe"
    27  )
    28  
    29  //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl
    30  
// TypeTraits is the type-specific helper returned by MemoTable.TypeTraits.
//
// NOTE(review): BytesRequired presumably reports how many bytes are needed to
// hold n values of the table's element type — confirm against the generated
// memo table implementations.
type TypeTraits interface {
	BytesRequired(n int) int
}
    34  
// ByteSlice is implemented by values that can expose their contents as a
// []byte. BinaryMemoTable accepts such values wherever it takes an
// interface{} key, alongside plain []byte and string values.
type ByteSlice interface {
	// Bytes returns the value's contents as a byte slice.
	Bytes() []byte
}
    38  
// MemoTable interface for hash tables and dictionary encoding.
//
// Values will remember the order they are inserted to generate a valid
// dictionary.
type MemoTable interface {
	// TypeTraits returns the TypeTraits associated with this table.
	TypeTraits() TypeTraits
	// Reset drops everything in the table allowing it to be reused
	Reset()
	// Size returns the current number of unique values stored in
	// the table, counting the null value if one has been
	// inserted via GetOrInsertNull.
	Size() int
	// GetOrInsert returns the index in the table of the specified value,
	// and a boolean indicating whether or not the value was found in
	// the table (if false, the value was inserted). An error is returned
	// if val is not the appropriate type for the table.
	GetOrInsert(val interface{}) (idx int, existed bool, err error)
	// GetOrInsertBytes returns the index in the table of the specified value,
	// and a boolean indicating whether or not the value was found in
	// the table (if false, the value was inserted). An error is returned
	// if val is not the appropriate type for the table. This function is
	// intended to be used by the BinaryMemoTable to prevent unnecessary
	// allocations of the data when converting from a []byte to interface{}.
	GetOrInsertBytes(val []byte) (idx int, existed bool, err error)
	// GetOrInsertNull returns the index of the null value in the table,
	// inserting one if it hasn't already been inserted. It returns a boolean
	// indicating if the null value already existed or not in the table.
	GetOrInsertNull() (idx int, existed bool)
	// GetNull returns the index of the null value in the table, but does not
	// insert one if it doesn't already exist. If no null has been inserted it
	// returns -1 and false.
	GetNull() (idx int, exists bool)
	// WriteOut copies the unique values of the memotable out to the byte slice
	// provided. Must have allocated enough bytes for all the values.
	WriteOut(out []byte)
	// WriteOutSubset is like WriteOut, but only writes a subset of values
	// starting with the index offset.
	WriteOutSubset(offset int, out []byte)
}
    77  
// NumericMemoTable extends MemoTable with variants of WriteOut and
// WriteOutSubset.
//
// NOTE(review): the LE suffix presumably means the values are written in
// little-endian byte order regardless of the host — confirm against the
// generated numeric memo tables.
type NumericMemoTable interface {
	MemoTable
	WriteOutLE(out []byte)
	WriteOutSubsetLE(offset int, out []byte)
}
    83  
    84  const (
    85  	sentinel   uint64 = 0
    86  	loadFactor int64  = 2
    87  )
    88  
    89  func max(a, b uint64) uint64 {
    90  	if a > b {
    91  		return a
    92  	}
    93  	return b
    94  }
    95  
    96  var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) }
    97  
// KeyNotFound is the index returned by memo table functions when a key is
// not present in the table. BinaryMemoTable also uses it as the stored null
// index while no null value has been inserted.
const KeyNotFound = -1
   100  
// BinaryBuilderIFace is the set of builder operations that BinaryMemoTable
// relies on to store the values inserted into it.
//
// NOTE(review): the per-method notes below describe how BinaryMemoTable uses
// each method; the precise contracts are defined by the concrete builder —
// confirm against the implementation that is passed in.
type BinaryBuilderIFace interface {
	// Reserve and ReserveData are called when the table is created or reset,
	// with an entry count and a byte count respectively.
	Reserve(int)
	ReserveData(int)
	// Retain and Release are forwarded from BinaryMemoTable.Retain/Release.
	Retain()
	// Resize and ResizeData are called with 0 when the table is reset.
	Resize(int)
	ResizeData(int)
	Release()
	// DataLen is reported by BinaryMemoTable.ValuesSize as the total number
	// of raw value bytes held.
	DataLen() int
	// Value returns the bytes stored at the given insertion index.
	Value(int) []byte
	// Len is treated as the number of entries held by the builder.
	Len() int
	// AppendNull, AppendString and Append add a new entry to the builder.
	AppendNull()
	AppendString(string)
	Append([]byte)
}
   115  
// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder
// to construct the actual data in an easy to pass around way with minimal copies
// while using a hash table to keep track of the indexes into the dictionary that
// is created as we go.
type BinaryMemoTable struct {
	// tbl maps value hashes to int32 payloads; each payload is the index of
	// the value inside builder.
	tbl *Int32HashTable
	// builder holds the inserted values (and the null entry, if any) in
	// insertion order.
	builder BinaryBuilderIFace
	// nullIdx is the index at which the null value was inserted, or
	// KeyNotFound if no null has been inserted.
	nullIdx int
}
   125  
   126  // NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will
   127  // be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used.
   128  // initial and valuesize can be used to pre-allocate the table to reduce allocations. With
   129  // initial being the initial number of entries to allocate for and valuesize being the starting
   130  // amount of space allocated for writing the actual binary data.
   131  func NewBinaryMemoTable(initial, valuesize int, bldr BinaryBuilderIFace) *BinaryMemoTable {
   132  	bldr.Reserve(int(initial))
   133  	datasize := valuesize
   134  	if datasize <= 0 {
   135  		datasize = initial * 4
   136  	}
   137  	bldr.ReserveData(datasize)
   138  	return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound}
   139  }
   140  
   141  type unimplementedtraits struct{}
   142  
   143  func (unimplementedtraits) BytesRequired(int) int { panic("unimplemented") }
   144  
   145  func (BinaryMemoTable) TypeTraits() TypeTraits {
   146  	return unimplementedtraits{}
   147  }
   148  
   149  // Reset dumps all of the data in the table allowing it to be reutilized.
   150  func (s *BinaryMemoTable) Reset() {
   151  	s.tbl.Reset(32)
   152  	s.builder.Resize(0)
   153  	s.builder.ResizeData(0)
   154  	s.builder.Reserve(int(32))
   155  	s.builder.ReserveData(int(32) * 4)
   156  	s.nullIdx = KeyNotFound
   157  }
   158  
   159  // GetNull returns the index of a null that has been inserted into the table or
   160  // KeyNotFound. The bool returned will be true if there was a null inserted into
   161  // the table, and false otherwise.
   162  func (s *BinaryMemoTable) GetNull() (int, bool) {
   163  	return int(s.nullIdx), s.nullIdx != KeyNotFound
   164  }
   165  
   166  // Size returns the current size of the memo table including the null value
   167  // if one has been inserted.
   168  func (s *BinaryMemoTable) Size() int {
   169  	sz := int(s.tbl.size)
   170  	if _, ok := s.GetNull(); ok {
   171  		sz++
   172  	}
   173  	return sz
   174  }
   175  
   176  // helper function to easily return a byte slice for any given value
   177  // regardless of the type if it's a []byte, string, or fulfills the
   178  // ByteSlice interface.
   179  func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte {
   180  	switch v := val.(type) {
   181  	case []byte:
   182  		return v
   183  	case ByteSlice:
   184  		return v.Bytes()
   185  	case string:
   186  		var out []byte
   187  		h := (*reflect.StringHeader)(unsafe.Pointer(&v))
   188  		s := (*reflect.SliceHeader)(unsafe.Pointer(&out))
   189  		s.Data = h.Data
   190  		s.Len = h.Len
   191  		s.Cap = h.Len
   192  		return out
   193  	default:
   194  		panic("invalid type for binarymemotable")
   195  	}
   196  }
   197  
   198  // helper function to get the hash value regardless of the underlying binary type
   199  func (BinaryMemoTable) getHash(val interface{}) uint64 {
   200  	switch v := val.(type) {
   201  	case string:
   202  		return hashString(v, 0)
   203  	case []byte:
   204  		return Hash(v, 0)
   205  	case ByteSlice:
   206  		return Hash(v.Bytes(), 0)
   207  	default:
   208  		panic("invalid type for binarymemotable")
   209  	}
   210  }
   211  
   212  // helper function to append the given value to the builder regardless
   213  // of the underlying binary type.
   214  func (b *BinaryMemoTable) appendVal(val interface{}) {
   215  	switch v := val.(type) {
   216  	case string:
   217  		b.builder.AppendString(v)
   218  	case []byte:
   219  		b.builder.Append(v)
   220  	case ByteSlice:
   221  		b.builder.Append(v.Bytes())
   222  	}
   223  }
   224  
   225  func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) {
   226  	return b.tbl.Lookup(h, func(i int32) bool {
   227  		return bytes.Equal(val, b.builder.Value(int(i)))
   228  	})
   229  }
   230  
   231  // Get returns the index of the specified value in the table or KeyNotFound,
   232  // and a boolean indicating whether it was found in the table.
   233  func (b *BinaryMemoTable) Get(val interface{}) (int, bool) {
   234  	if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok {
   235  		return int(p.payload.val), ok
   236  	}
   237  	return KeyNotFound, false
   238  }
   239  
   240  // GetOrInsertBytes returns the index of the given value in the table, if not found
   241  // it is inserted into the table. The return value 'found' indicates whether the value
   242  // was found in the table (true) or inserted (false) along with any possible error.
   243  func (b *BinaryMemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) {
   244  	h := Hash(val, 0)
   245  	p, found := b.lookup(h, val)
   246  	if found {
   247  		idx = int(p.payload.val)
   248  	} else {
   249  		idx = b.Size()
   250  		b.builder.Append(val)
   251  		b.tbl.Insert(p, h, int32(idx), -1)
   252  	}
   253  	return
   254  }
   255  
   256  // GetOrInsert returns the index of the given value in the table, if not found
   257  // it is inserted into the table. The return value 'found' indicates whether the value
   258  // was found in the table (true) or inserted (false) along with any possible error.
   259  func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
   260  	h := b.getHash(val)
   261  	p, found := b.lookup(h, b.valAsByteSlice(val))
   262  	if found {
   263  		idx = int(p.payload.val)
   264  	} else {
   265  		idx = b.Size()
   266  		b.appendVal(val)
   267  		b.tbl.Insert(p, h, int32(idx), -1)
   268  	}
   269  	return
   270  }
   271  
   272  // GetOrInsertNull retrieves the index of a null in the table or inserts
   273  // null into the table, returning the index and a boolean indicating if it was
   274  // found in the table (true) or was inserted (false).
   275  func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) {
   276  	idx, found = b.GetNull()
   277  	if !found {
   278  		idx = b.Size()
   279  		b.nullIdx = idx
   280  		b.builder.AppendNull()
   281  	}
   282  	return
   283  }
   284  
   285  func (b *BinaryMemoTable) Value(i int) []byte {
   286  	return b.builder.Value(i)
   287  }
   288  
   289  // helper function to get the offset into the builder data for a given
   290  // index value.
   291  func (b *BinaryMemoTable) findOffset(idx int) uintptr {
   292  	if b.builder.DataLen() == 0 {
   293  		// only empty strings, short circuit
   294  		return 0
   295  	}
   296  
   297  	val := b.builder.Value(idx)
   298  	for len(val) == 0 {
   299  		idx++
   300  		if idx >= b.builder.Len() {
   301  			break
   302  		}
   303  		val = b.builder.Value(idx)
   304  	}
   305  	if len(val) != 0 {
   306  		return uintptr(unsafe.Pointer(&val[0]))
   307  	}
   308  	return uintptr(b.builder.DataLen()) + b.findOffset(0)
   309  }
   310  
   311  // CopyOffsets copies the list of offsets into the passed in slice, the offsets
   312  // being the start and end values of the underlying allocated bytes in the builder
   313  // for the individual values of the table. out should be at least sized to Size()+1
   314  func (b *BinaryMemoTable) CopyOffsets(out []int32) {
   315  	b.CopyOffsetsSubset(0, out)
   316  }
   317  
   318  // CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets,
   319  // it gets a subset of the offsets in the table starting at the index provided by "start".
   320  func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int32) {
   321  	if b.builder.Len() <= start {
   322  		return
   323  	}
   324  
   325  	first := b.findOffset(0)
   326  	delta := b.findOffset(start)
   327  	sz := b.Size()
   328  	for i := start; i < sz; i++ {
   329  		offset := int32(b.findOffset(i) - delta)
   330  		out[i-start] = offset
   331  	}
   332  
   333  	out[sz-start] = int32(b.builder.DataLen() - (int(delta) - int(first)))
   334  }
   335  
   336  // CopyLargeOffsets copies the list of offsets into the passed in slice, the offsets
   337  // being the start and end values of the underlying allocated bytes in the builder
   338  // for the individual values of the table. out should be at least sized to Size()+1
   339  func (b *BinaryMemoTable) CopyLargeOffsets(out []int64) {
   340  	b.CopyLargeOffsetsSubset(0, out)
   341  }
   342  
   343  // CopyLargeOffsetsSubset is like CopyOffsets but instead of copying all of the offsets,
   344  // it gets a subset of the offsets in the table starting at the index provided by "start".
   345  func (b *BinaryMemoTable) CopyLargeOffsetsSubset(start int, out []int64) {
   346  	if b.builder.Len() <= start {
   347  		return
   348  	}
   349  
   350  	first := b.findOffset(0)
   351  	delta := b.findOffset(start)
   352  	sz := b.Size()
   353  	for i := start; i < sz; i++ {
   354  		offset := int64(b.findOffset(i) - delta)
   355  		out[i-start] = offset
   356  	}
   357  
   358  	out[sz-start] = int64(b.builder.DataLen() - (int(delta) - int(first)))
   359  }
   360  
   361  // CopyValues copies the raw binary data bytes out, out should be a []byte
   362  // with at least ValuesSize bytes allocated to copy into.
   363  func (b *BinaryMemoTable) CopyValues(out interface{}) {
   364  	b.CopyValuesSubset(0, out)
   365  }
   366  
   367  // CopyValuesSubset copies the raw binary data bytes out starting with the value
   368  // at the index start, out should be a []byte with at least ValuesSize bytes allocated
   369  func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) {
   370  	if b.builder.Len() <= start {
   371  		return
   372  	}
   373  
   374  	var (
   375  		first  = b.findOffset(0)
   376  		offset = b.findOffset(int(start))
   377  		length = b.builder.DataLen() - int(offset-first)
   378  	)
   379  
   380  	outval := out.([]byte)
   381  	copy(outval, b.builder.Value(start)[0:length])
   382  }
   383  
   384  func (b *BinaryMemoTable) WriteOut(out []byte) {
   385  	b.CopyValues(out)
   386  }
   387  
   388  func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) {
   389  	b.CopyValuesSubset(start, out)
   390  }
   391  
   392  // CopyFixedWidthValues exists to cope with the fact that the table doesn't keep
   393  // track of the fixed width when inserting the null value the databuffer holds a
   394  // zero length byte slice for the null value (if found)
   395  func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) {
   396  	if start >= b.Size() {
   397  		return
   398  	}
   399  
   400  	null, exists := b.GetNull()
   401  	if !exists || null < start {
   402  		// nothing to skip, proceed as usual
   403  		b.CopyValuesSubset(start, out)
   404  		return
   405  	}
   406  
   407  	var (
   408  		leftOffset  = b.findOffset(start)
   409  		nullOffset  = b.findOffset(null)
   410  		leftSize    = nullOffset - leftOffset
   411  		rightOffset = leftOffset + uintptr(b.ValuesSize())
   412  	)
   413  
   414  	if leftSize > 0 {
   415  		copy(out, b.builder.Value(start)[0:leftSize])
   416  	}
   417  
   418  	rightSize := rightOffset - nullOffset
   419  	if rightSize > 0 {
   420  		// skip the null fixed size value
   421  		copy(out[int(leftSize)+width:], b.builder.Value(null + 1)[0:rightSize])
   422  	}
   423  }
   424  
   425  // VisitValues exists to run the visitFn on each value currently in the hash table.
   426  func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) {
   427  	for i := int(start); i < b.Size(); i++ {
   428  		visitFn(b.builder.Value(i))
   429  	}
   430  }
   431  
   432  // Release is used to tell the underlying builder that it can release the memory allocated
   433  // when the reference count reaches 0, this is safe to be called from multiple goroutines
   434  // simultaneously
   435  func (b *BinaryMemoTable) Release() { b.builder.Release() }
   436  
   437  // Retain increases the ref count, it is safe to call it from multiple goroutines
   438  // simultaneously.
   439  func (b *BinaryMemoTable) Retain() { b.builder.Retain() }
   440  
   441  // ValuesSize returns the current total size of all the raw bytes that have been inserted
   442  // into the memotable so far.
   443  func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() }