github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/hashing/xxh3_memo_table.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package hashing provides utilities for, and an implementation of, a hash
// table that is more performant than the default Go map implementation by
// leveraging xxh3 and some custom hash functions.
package hashing

import (
	"bytes"
	"math"
	"math/bits"
	"reflect"
	"unsafe"

	"github.com/apache/arrow/go/v7/arrow"
	"github.com/apache/arrow/go/v7/arrow/array"
	"github.com/apache/arrow/go/v7/arrow/memory"
	"github.com/apache/arrow/go/v7/parquet"

	"github.com/zeebo/xxh3"
)

//go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl

func hashInt(val uint64, alg uint64) uint64 {
	// Two of xxhash's prime multipliers (which are chosen for their
	// bit dispersion properties)
	var multipliers = [2]uint64{11400714785074694791, 14029467366897019727}
	// Multiplying by the prime number mixes the low bits into the high bits,
	// then byte-swapping (which is a single CPU instruction) allows the
	// combined high and low bits to participate in the initial hash table index.
	return bits.ReverseBytes64(multipliers[alg] * val)
}
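
// Illustrative sketch (not part of the original file): why the byte swap in
// hashInt matters. Assuming the table derives its slot from the low bits of the
// hash with a power-of-two mask (a common scheme, used here purely for
// illustration), the swap moves the well-mixed high bits of the multiply down
// into the range that the mask keeps.
func exampleSlotFromHash(val, tableSize uint64) uint64 {
	// tableSize is assumed to be a power of two so the mask keeps only the
	// low bits of the hash.
	return hashInt(val, 0) & (tableSize - 1)
}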

func hashFloat32(val float32, alg uint64) uint64 {
	// grab the raw byte pattern of the float32 and treat it as a uint32
	bt := *(*[4]byte)(unsafe.Pointer(&val))
	x := uint64(*(*uint32)(unsafe.Pointer(&bt[0])))
	// hash the same value with both algorithms and combine with the width (4)
	hx := hashInt(x, alg)
	hy := hashInt(x, alg^1)
	return 4 ^ hx ^ hy
}

func hashFloat64(val float64, alg uint64) uint64 {
	// split the raw byte pattern into two 32-bit halves, hash each half with a
	// different algorithm, then combine them with the width (8)
	bt := *(*[8]byte)(unsafe.Pointer(&val))
	hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg)
	hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1)
	return 8 ^ hx ^ hy
}

func hashString(val string, alg uint64) uint64 {
	// zero-copy view of the string's bytes so we can reuse the []byte hash
	buf := *(*[]byte)(unsafe.Pointer(&val))
	(*reflect.SliceHeader)(unsafe.Pointer(&buf)).Cap = len(val)
	return hash(buf, alg)
}

// prime constants used for slightly increasing the hash quality further
var exprimes = [2]uint64{1609587929392839161, 9650029242287828579}

// For small byte slices this is faster than even calling into xxh3 to do the
// hash, so we specialize in order to get the benefit of that performance.
func hash(b []byte, alg uint64) uint64 {
	n := uint32(len(b))
	if n <= 16 {
		switch {
		case n > 8:
			// 8 < length <= 16
			// apply the same principle as hashFloat64 above, but with two
			// overlapping 64-bit reads
			x := *(*uint64)(unsafe.Pointer(&b[n-8]))
			y := *(*uint64)(unsafe.Pointer(&b[0]))
			hx := hashInt(x, alg)
			hy := hashInt(y, alg^1)
			return uint64(n) ^ hx ^ hy
		case n >= 4:
			// 4 <= length <= 8
			// we can read the bytes as two overlapping 32-bit ints, apply a
			// different hash function to each in parallel, then xor the results
			x := *(*uint32)(unsafe.Pointer(&b[n-4]))
			y := *(*uint32)(unsafe.Pointer(&b[0]))
			hx := hashInt(uint64(x), alg)
			hy := hashInt(uint64(y), alg^1)
			return uint64(n) ^ hx ^ hy
		case n > 0:
			// 1 <= length <= 3
			// mix the length and three sampled bytes (first, middle, last)
			// into a single 32-bit value and hash that
			x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1]))
			return hashInt(uint64(x), alg)
		case n == 0:
			return 1
		}
	}

	// increase differentiation enough to improve hash quality
	return xxh3.Hash(b) + exprimes[alg]
}
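
// Illustrative sketch (not part of the original file): hash dispatches on the
// input length, so short keys stay in the specialized branches above while
// anything longer than 16 bytes falls through to xxh3. Hypothetical helper for
// demonstration only.
func exampleHashPaths() (shortKey, longKey uint64) {
	shortKey = hash([]byte("abcd"), 0)                          // 4 <= len <= 8: two overlapping 32-bit reads
	longKey = hash([]byte("a key that is longer than 16 bytes"), 0) // > 16 bytes: xxh3 plus an extra prime
	return shortKey, longKey
}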

const (
	sentinel   uint64 = 0
	loadFactor int64  = 2
)

func max(a, b uint64) uint64 {
	if a > b {
		return a
	}
	return b
}

var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) }

// KeyNotFound is the constant returned by memo table functions when a key isn't found in the table
const KeyNotFound = -1

// BinaryMemoTable is our hash table for binary data. It uses a BinaryBuilder
// to construct the actual dictionary data in an easy-to-pass-around form with
// minimal copies, while the hash table keeps track of the indexes into that
// dictionary as it is built up.
type BinaryMemoTable struct {
	tbl     *Int32HashTable
	builder *array.BinaryBuilder
	nullIdx int
}

// NewBinaryMemoTable returns a hash table for Binary data. The passed in allocator
// will be used for the BinaryBuilder; if nil, memory.DefaultAllocator will be used.
// initial and valuesize can be used to pre-allocate the table to reduce allocations:
// initial is the initial number of entries to allocate for, and valuesize is the
// starting amount of space allocated for writing the actual binary data.
func NewBinaryMemoTable(mem memory.Allocator, initial, valuesize int) *BinaryMemoTable {
	if mem == nil {
		mem = memory.DefaultAllocator
	}
	bldr := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary)
	bldr.Reserve(int(initial))
	datasize := valuesize
	if datasize <= 0 {
		datasize = initial * 4
	}
	bldr.ReserveData(datasize)
	return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound}
}
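
// Illustrative sketch (not part of the original file): typical use of the
// BinaryMemoTable to build a dictionary of unique values while mapping each
// input to its dictionary index. The sizes and values below are arbitrary and
// for demonstration only.
func exampleBinaryMemoTableUsage() []int {
	// 128 initial entries and 1 KiB of value space are arbitrary starting sizes.
	memo := NewBinaryMemoTable(memory.DefaultAllocator, 128, 1024)
	defer memo.Release()

	indices := make([]int, 0, 3)
	for _, v := range []string{"foo", "bar", "foo"} {
		// GetOrInsert appends new values to the dictionary and returns the
		// existing index for values that were already inserted.
		idx, _, _ := memo.GetOrInsert(v)
		indices = append(indices, idx)
	}
	// indices is now [0, 1, 0]: "foo" and "bar" each get one dictionary slot,
	// and the repeated "foo" maps back to index 0.
	return indices
}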

// Reset clears all of the data in the table, allowing it to be reused.
func (s *BinaryMemoTable) Reset() {
	s.tbl.Reset(32)
	s.builder.NewArray().Release()
	s.builder.Reserve(int(32))
	s.builder.ReserveData(int(32) * 4)
	s.nullIdx = KeyNotFound
}

// GetNull returns the index of a null that has been inserted into the table or
// KeyNotFound. The bool returned will be true if there was a null inserted into
// the table, and false otherwise.
func (s *BinaryMemoTable) GetNull() (int, bool) {
	return int(s.nullIdx), s.nullIdx != KeyNotFound
}

// Size returns the current size of the memo table including the null value
// if one has been inserted.
func (s *BinaryMemoTable) Size() int {
	sz := int(s.tbl.size)
	if _, ok := s.GetNull(); ok {
		sz++
	}
	return sz
}

// helper function to return a byte slice for any given value regardless of its
// type, whether it's a []byte, parquet.ByteArray, parquet.FixedLenByteArray or string.
func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte {
	switch v := val.(type) {
	case []byte:
		return v
	case parquet.ByteArray:
		return *(*[]byte)(unsafe.Pointer(&v))
	case parquet.FixedLenByteArray:
		return *(*[]byte)(unsafe.Pointer(&v))
	case string:
		// zero-copy conversion of the string to a []byte
		var out []byte
		h := (*reflect.StringHeader)(unsafe.Pointer(&v))
		s := (*reflect.SliceHeader)(unsafe.Pointer(&out))
		s.Data = h.Data
		s.Len = h.Len
		s.Cap = h.Len
		return out
	default:
		panic("invalid type for binarymemotable")
	}
}

// helper function to get the hash value regardless of the underlying binary type
func (BinaryMemoTable) getHash(val interface{}) uint64 {
	switch v := val.(type) {
	case string:
		return hashString(v, 0)
	case []byte:
		return hash(v, 0)
	case parquet.ByteArray:
		return hash(*(*[]byte)(unsafe.Pointer(&v)), 0)
	case parquet.FixedLenByteArray:
		return hash(*(*[]byte)(unsafe.Pointer(&v)), 0)
	default:
		panic("invalid type for binarymemotable")
	}
}

// helper function to append the given value to the builder regardless
// of the underlying binary type.
func (b *BinaryMemoTable) appendVal(val interface{}) {
	switch v := val.(type) {
	case string:
		b.builder.AppendString(v)
	case []byte:
		b.builder.Append(v)
	case parquet.ByteArray:
		b.builder.Append(*(*[]byte)(unsafe.Pointer(&v)))
	case parquet.FixedLenByteArray:
		b.builder.Append(*(*[]byte)(unsafe.Pointer(&v)))
	}
}

func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) {
	return b.tbl.Lookup(h, func(i int32) bool {
		return bytes.Equal(val, b.builder.Value(int(i)))
	})
}

// Get returns the index of the specified value in the table or KeyNotFound,
// and a boolean indicating whether it was found in the table.
func (b *BinaryMemoTable) Get(val interface{}) (int, bool) {
	if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok {
		return int(p.payload.val), ok
	}
	return KeyNotFound, false
}

// GetOrInsert returns the index of the given value in the table; if the value is
// not found it is inserted into the table. The returned 'found' value indicates
// whether the value was already present (true) or was inserted (false), along
// with any possible error.
func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) {
	h := b.getHash(val)
	p, found := b.lookup(h, b.valAsByteSlice(val))
	if found {
		idx = int(p.payload.val)
	} else {
		idx = b.Size()
		b.appendVal(val)
		b.tbl.Insert(p, h, int32(idx), -1)
	}
	return
}

// GetOrInsertNull retrieves the index of a null in the table or inserts
// null into the table, returning the index and a boolean indicating if it was
// found in the table (true) or was inserted (false).
func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) {
	idx, found = b.GetNull()
	if !found {
		idx = b.Size()
		b.nullIdx = idx
		b.builder.AppendNull()
	}
	return
}
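
// Illustrative sketch (not part of the original file): how a null slot is
// recorded alongside regular values. Hypothetical helper for demonstration only.
func exampleNullHandling(memo *BinaryMemoTable) {
	// Insert a regular value first; it occupies index 0 in the dictionary.
	memo.GetOrInsert([]byte("present"))

	// The null takes the next free index and is remembered separately so that
	// GetNull can report it later.
	nullIdx, _ := memo.GetOrInsertNull()

	// GetNull now reports the same index with found == true, and Size counts
	// the null entry as well.
	if idx, found := memo.GetNull(); found && idx == nullIdx {
		_ = memo.Size() // 2: one regular value plus the null
	}
}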

// helper function to get the offset into the builder data for a given
// index value.
func (b *BinaryMemoTable) findOffset(idx int) uintptr {
	// skip forward past any zero-length values (such as the null entry) until
	// we find a value whose data pointer we can use.
	val := b.builder.Value(idx)
	for len(val) == 0 {
		idx++
		if idx >= b.builder.Len() {
			break
		}
		val = b.builder.Value(idx)
	}
	if len(val) != 0 {
		return uintptr(unsafe.Pointer(&val[0]))
	}
	// no non-empty value follows idx, so return the end of the data buffer.
	return uintptr(b.builder.DataLen()) + b.findOffset(0)
}

// CopyOffsets copies the list of offsets into the passed in slice: the offsets
// are the start and end positions of the underlying allocated bytes in the builder
// for the individual values of the table. out should be sized to at least Size()+1.
func (b *BinaryMemoTable) CopyOffsets(out []int8) {
	b.CopyOffsetsSubset(0, out)
}

// CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets,
// it copies the subset of the offsets in the table starting at the index provided by "start".
func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int8) {
	if b.builder.Len() <= start {
		return
	}

	first := b.findOffset(0)
	delta := b.findOffset(start)
	for i := start; i < b.Size(); i++ {
		offset := int8(b.findOffset(i) - delta)
		out[i-start] = offset
	}

	out[b.Size()-start] = int8(b.builder.DataLen() - int(delta) - int(first))
}

// CopyValues copies the raw binary data bytes out. out should be a []byte
// with at least ValuesSize bytes allocated to copy into.
func (b *BinaryMemoTable) CopyValues(out interface{}) {
	b.CopyValuesSubset(0, out)
}

// CopyValuesSubset copies the raw binary data bytes out starting with the value
// at the index start. out should be a []byte with at least ValuesSize bytes allocated.
func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) {
	var (
		first  = b.findOffset(0)
		offset = b.findOffset(int(start))
		length = b.builder.DataLen() - int(offset-first)
	)

	outval := out.([]byte)
	copy(outval, b.builder.Value(start)[0:length])
}

// WriteOut writes the raw binary data bytes to out, equivalent to CopyValues.
func (b *BinaryMemoTable) WriteOut(out []byte) {
	b.CopyValues(out)
}

// WriteOutSubset writes the raw binary data bytes to out starting with the value
// at the index start, equivalent to CopyValuesSubset.
func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) {
	b.CopyValuesSubset(start, out)
}

// CopyFixedWidthValues exists to cope with the fact that the table doesn't keep
// track of the fixed width when inserting the null value: the databuffer holds a
// zero-length byte slice for the null value (if one was inserted), so the copy has
// to skip over that position and leave room for one fixed-width value there.
func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) {
	if start >= b.Size() {
		return
	}

	null, exists := b.GetNull()
	if !exists || null < start {
		// nothing to skip, proceed as usual
		b.CopyValuesSubset(start, out)
		return
	}

	var (
		leftOffset = b.findOffset(start)
		nullOffset = b.findOffset(null)
		leftSize   = nullOffset - leftOffset
	)

	if leftSize > 0 {
		copy(out, b.builder.Value(start)[0:leftSize])
	}

	rightSize := b.ValuesSize() - int(nullOffset)
	if rightSize > 0 {
		// skip the null fixed size value
		copy(out[int(leftSize)+width:], b.builder.Value(int(nullOffset))[0:rightSize])
	}
}

// VisitValues runs the visitFn on each value currently in the hash table,
// starting at the index start.
func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) {
	for i := int(start); i < b.Size(); i++ {
		visitFn(b.builder.Value(i))
	}
}

// Release tells the underlying builder that it can release the memory allocated
// when the reference count reaches 0. This is safe to call from multiple
// goroutines simultaneously.
func (b *BinaryMemoTable) Release() { b.builder.Release() }

// Retain increases the ref count; it is safe to call from multiple goroutines
// simultaneously.
func (b *BinaryMemoTable) Retain() { b.builder.Retain() }

// ValuesSize returns the current total size of all the raw bytes that have been inserted
// into the memotable so far.
func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() }
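
// Illustrative sketch (not part of the original file): pulling the accumulated
// dictionary bytes back out of the table. Hypothetical helper for demonstration
// only; it assumes at least one value has already been inserted.
func exampleCopyAllValues(memo *BinaryMemoTable) []byte {
	// ValuesSize reports the total number of raw bytes appended so far, so a
	// buffer of exactly that size can hold everything CopyValues writes.
	out := make([]byte, memo.ValuesSize())
	memo.CopyValues(out)
	return out
}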