// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package hashing provides utilities for and an implementation of a hash
// table which is more performant than the default go map implementation
// by leveraging xxh3 and some custom hash functions.
package hashing

import (
	"bytes"
	"math"
	"math/bits"
	"reflect"
	"unsafe"

	"github.com/apache/arrow/go/v10/parquet"

	"github.com/zeebo/xxh3"
)

//go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl

// TypeTraits is the per-type helper a MemoTable exposes through its
// TypeTraits method.
//
// NOTE(review): BytesRequired presumably reports how many bytes are needed to
// hold n values of the table's element type; the implementations live outside
// this file, so confirm against them.
type TypeTraits interface {
	BytesRequired(n int) int
}

// MemoTable interface for hash tables and dictionary encoding.
//
// Values will remember the order they are inserted to generate a valid
// dictionary.
type MemoTable interface {
	// TypeTraits returns the TypeTraits associated with this table.
	TypeTraits() TypeTraits
	// Reset drops everything in the table allowing it to be reused
	Reset()
	// Size returns the current number of unique values stored in
	// the table, including whether or not a null value has been
	// inserted via GetOrInsertNull.
	Size() int
	// GetOrInsert returns the index of the table the specified value is,
	// and a boolean indicating whether or not the value was found in
	// the table (if false, the value was inserted). An error is returned
	// if val is not the appropriate type for the table.
	GetOrInsert(val interface{}) (idx int, existed bool, err error)
	// GetOrInsertNull returns the index of the null value in the table,
	// inserting one if it hasn't already been inserted. It returns a boolean
	// indicating if the null value already existed or not in the table.
	GetOrInsertNull() (idx int, existed bool)
	// GetNull returns the index of the null value in the table, but does not
	// insert one if it doesn't already exist. Will return -1 if it doesn't exist
	// indicated by a false value for the boolean.
	GetNull() (idx int, exists bool)
	// WriteOut copies the unique values of the memotable out to the byte slice
	// provided. Must have allocated enough bytes for all the values.
	WriteOut(out []byte)
	// WriteOutSubset is like WriteOut, but only writes a subset of values
	// starting with the index offset.
	WriteOutSubset(offset int, out []byte)
}

// NumericMemoTable is a MemoTable that additionally offers the WriteOutLE and
// WriteOutSubsetLE variants of the write-out methods.
//
// NOTE(review): the LE suffix presumably means the values are written in
// little-endian byte order; confirm against the generated implementations.
type NumericMemoTable interface {
	MemoTable
	WriteOutLE(out []byte)
	WriteOutSubsetLE(offset int, out []byte)
}

// hashInt hashes val with one of two multiplicative hash functions selected
// by alg. alg must be 0 or 1: it indexes a two element array, so any other
// value panics with an index out of range.
func hashInt(val uint64, alg uint64) uint64 {
	// Two of xxhash's prime multipliers (which are chosen for their
	// bit dispersion properties)
	var multipliers = [2]uint64{11400714785074694791, 14029467366897019727}
	// Multiplying by the prime number mixes the low bits into the high bits,
	// then byte-swapping (which is a single CPU instruction) allows the
	// combined high and low bits to participate in the initial hash table index.
	return bits.ReverseBytes64(multipliers[alg] * val)
}

// hashFloat32 hashes the raw bit pattern of val, so values that compare equal
// but have different bit patterns hash differently. The same 32 bits are run
// through both hashInt variants (alg and alg^1) and the results are XORed
// together with the byte width (4).
func hashFloat32(val float32, alg uint64) uint64 {
	// grab the raw byte pattern of the float and reinterpret it as a uint32
	bt := *(*[4]byte)(unsafe.Pointer(&val))
	x := uint64(*(*uint32)(unsafe.Pointer(&bt[0])))
	hx := hashInt(x, alg)
	hy := hashInt(x, alg^1)
	return 4 ^ hx ^ hy
}

// hashFloat64 hashes the raw bit pattern of val as two 32-bit halves: bytes
// 4-7 go through hashInt variant alg and bytes 0-3 through variant alg^1. The
// two results are XORed together with the byte width (8).
func hashFloat64(val float64, alg uint64) uint64 {
	bt := *(*[8]byte)(unsafe.Pointer(&val))
	hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg)
	hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1)
	return 8 ^ hx ^ hy
}

// hashString hashes the bytes of val with hash, viewing the string's memory
// as a byte slice instead of copying it.
func hashString(val string, alg uint64) uint64 {
	buf := *(*[]byte)(unsafe.Pointer(&val))
	// a string header has no capacity field, so the Cap read through the
	// slice header above is not meaningful; pin it to the string length.
	(*reflect.SliceHeader)(unsafe.Pointer(&buf)).Cap = len(val)
	return hash(buf, alg)
}

// prime constants used for slightly increasing the hash quality further
var exprimes = [2]uint64{1609587929392839161, 9650029242287828579}

// for smaller amounts of bytes this is faster than even calling into
// xxh3 to do the hash, so we specialize in order to get the benefits
// of that performance.
func hash(b []byte, alg uint64) uint64 {
	// alg selects the hashInt multiplier / exprimes entry and must be 0 or 1.
	n := uint32(len(b))
	if n <= 16 {
		switch {
		case n > 8:
			// 8 < length <= 16
			// apply the same principle as the 4..8 byte case below, but
			// with two (possibly overlapping) 64-bit ints
			x := *(*uint64)(unsafe.Pointer(&b[n-8]))
			y := *(*uint64)(unsafe.Pointer(&b[0]))
			hx := hashInt(x, alg)
			hy := hashInt(y, alg^1)
			return uint64(n) ^ hx ^ hy
		case n >= 4:
			// 4 <= length <= 8
			// we can read the bytes as two overlapping 32-bit ints, apply different
			// hash functions to each in parallel
			// then xor the results
			x := *(*uint32)(unsafe.Pointer(&b[n-4]))
			y := *(*uint32)(unsafe.Pointer(&b[0]))
			hx := hashInt(uint64(x), alg)
			hy := hashInt(uint64(y), alg^1)
			return uint64(n) ^ hx ^ hy
		case n > 0:
			// 1 <= length <= 3
			// pack the length and the first, middle and last byte into a
			// single 32-bit int and hash that
			x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1]))
			return hashInt(uint64(x), alg)
		case n == 0:
			// empty input always hashes to the constant 1
			return 1
		}
	}

	// increase differentiation enough to improve hash quality
	return xxh3.Hash(b) + exprimes[alg]
}

// NOTE(review): sentinel and loadFactor are not referenced in this part of the
// file; presumably the generated hash tables use them as the "empty slot" hash
// marker and the capacity multiplier. Confirm against xxh3_memo_table.gen.go.
const (
	sentinel   uint64 = 0
	loadFactor int64  = 2
)

// max returns the larger of the two uint64 values.
func max(a, b uint64) uint64 {
	if a > b {
		return a
	}
	return b
}

// isNan32Cmp reports whether the float32 v is NaN by widening it to float64
// and using math.IsNaN.
var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) }

// KeyNotFound is the constant returned by memo table functions when a key isn't found in the table
const KeyNotFound = -1

// BinaryBuilderIFace is the set of builder operations BinaryMemoTable relies
// on to store the binary values it has seen.
//
// NOTE(review): the method set looks like it mirrors Arrow's array
// BinaryBuilder; the exact semantics of each method are defined by the
// implementation passed to NewBinaryMemoTable.
type BinaryBuilderIFace interface {
	Reserve(int)
	ReserveData(int)
	Retain()
	Resize(int)
	ResizeData(int)
	Release()
	DataLen() int
	Value(int) []byte
	Len() int
	AppendNull()
	AppendString(string)
	Append([]byte)
}

// BinaryMemoTable is our hashtable for binary data using the BinaryBuilder
// to construct the actual data in an easy to pass around way with minimal copies
// while using a hash table to keep track of the indexes into the dictionary that
// is created as we go.
187 type BinaryMemoTable struct { 188 tbl *Int32HashTable 189 builder BinaryBuilderIFace 190 nullIdx int 191 } 192 193 // NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will 194 // be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used. 195 // initial and valuesize can be used to pre-allocate the table to reduce allocations. With 196 // initial being the initial number of entries to allocate for and valuesize being the starting 197 // amount of space allocated for writing the actual binary data. 198 func NewBinaryMemoTable(initial, valuesize int, bldr BinaryBuilderIFace) *BinaryMemoTable { 199 bldr.Reserve(int(initial)) 200 datasize := valuesize 201 if datasize <= 0 { 202 datasize = initial * 4 203 } 204 bldr.ReserveData(datasize) 205 return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound} 206 } 207 208 type unimplementedtraits struct{} 209 210 func (unimplementedtraits) BytesRequired(int) int { panic("unimplemented") } 211 212 func (BinaryMemoTable) TypeTraits() TypeTraits { 213 return unimplementedtraits{} 214 } 215 216 // Reset dumps all of the data in the table allowing it to be reutilized. 217 func (s *BinaryMemoTable) Reset() { 218 s.tbl.Reset(32) 219 s.builder.Resize(0) 220 s.builder.ResizeData(0) 221 s.builder.Reserve(int(32)) 222 s.builder.ReserveData(int(32) * 4) 223 s.nullIdx = KeyNotFound 224 } 225 226 // GetNull returns the index of a null that has been inserted into the table or 227 // KeyNotFound. The bool returned will be true if there was a null inserted into 228 // the table, and false otherwise. 229 func (s *BinaryMemoTable) GetNull() (int, bool) { 230 return int(s.nullIdx), s.nullIdx != KeyNotFound 231 } 232 233 // Size returns the current size of the memo table including the null value 234 // if one has been inserted. 
235 func (s *BinaryMemoTable) Size() int { 236 sz := int(s.tbl.size) 237 if _, ok := s.GetNull(); ok { 238 sz++ 239 } 240 return sz 241 } 242 243 // helper function to easily return a byte slice for any given value 244 // regardless of the type if it's a []byte, parquet.ByteArray, 245 // parquet.FixedLenByteArray or string. 246 func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { 247 switch v := val.(type) { 248 case []byte: 249 return v 250 case parquet.ByteArray: 251 return *(*[]byte)(unsafe.Pointer(&v)) 252 case parquet.FixedLenByteArray: 253 return *(*[]byte)(unsafe.Pointer(&v)) 254 case string: 255 var out []byte 256 h := (*reflect.StringHeader)(unsafe.Pointer(&v)) 257 s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) 258 s.Data = h.Data 259 s.Len = h.Len 260 s.Cap = h.Len 261 return out 262 default: 263 panic("invalid type for binarymemotable") 264 } 265 } 266 267 // helper function to get the hash value regardless of the underlying binary type 268 func (BinaryMemoTable) getHash(val interface{}) uint64 { 269 switch v := val.(type) { 270 case string: 271 return hashString(v, 0) 272 case []byte: 273 return hash(v, 0) 274 case parquet.ByteArray: 275 return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) 276 case parquet.FixedLenByteArray: 277 return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) 278 default: 279 panic("invalid type for binarymemotable") 280 } 281 } 282 283 // helper function to append the given value to the builder regardless 284 // of the underlying binary type. 
285 func (b *BinaryMemoTable) appendVal(val interface{}) { 286 switch v := val.(type) { 287 case string: 288 b.builder.AppendString(v) 289 case []byte: 290 b.builder.Append(v) 291 case parquet.ByteArray: 292 b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) 293 case parquet.FixedLenByteArray: 294 b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) 295 } 296 } 297 298 func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) { 299 return b.tbl.Lookup(h, func(i int32) bool { 300 return bytes.Equal(val, b.builder.Value(int(i))) 301 }) 302 } 303 304 // Get returns the index of the specified value in the table or KeyNotFound, 305 // and a boolean indicating whether it was found in the table. 306 func (b *BinaryMemoTable) Get(val interface{}) (int, bool) { 307 if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok { 308 return int(p.payload.val), ok 309 } 310 return KeyNotFound, false 311 } 312 313 // GetOrInsert returns the index of the given value in the table, if not found 314 // it is inserted into the table. The return value 'found' indicates whether the value 315 // was found in the table (true) or inserted (false) along with any possible error. 316 func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { 317 h := b.getHash(val) 318 p, found := b.lookup(h, b.valAsByteSlice(val)) 319 if found { 320 idx = int(p.payload.val) 321 } else { 322 idx = b.Size() 323 b.appendVal(val) 324 b.tbl.Insert(p, h, int32(idx), -1) 325 } 326 return 327 } 328 329 // GetOrInsertNull retrieves the index of a null in the table or inserts 330 // null into the table, returning the index and a boolean indicating if it was 331 // found in the table (true) or was inserted (false). 
332 func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) { 333 idx, found = b.GetNull() 334 if !found { 335 idx = b.Size() 336 b.nullIdx = idx 337 b.builder.AppendNull() 338 } 339 return 340 } 341 342 // helper function to get the offset into the builder data for a given 343 // index value. 344 func (b *BinaryMemoTable) findOffset(idx int) uintptr { 345 val := b.builder.Value(idx) 346 for len(val) == 0 { 347 idx++ 348 if idx >= b.builder.Len() { 349 break 350 } 351 val = b.builder.Value(idx) 352 } 353 if len(val) != 0 { 354 return uintptr(unsafe.Pointer(&val[0])) 355 } 356 return uintptr(b.builder.DataLen()) + b.findOffset(0) 357 } 358 359 // CopyOffsets copies the list of offsets into the passed in slice, the offsets 360 // being the start and end values of the underlying allocated bytes in the builder 361 // for the individual values of the table. out should be at least sized to Size()+1 362 func (b *BinaryMemoTable) CopyOffsets(out []int32) { 363 b.CopyOffsetsSubset(0, out) 364 } 365 366 // CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, 367 // it gets a subset of the offsets in the table starting at the index provided by "start". 368 func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int32) { 369 if b.builder.Len() <= start { 370 return 371 } 372 373 first := b.findOffset(0) 374 delta := b.findOffset(start) 375 sz := b.Size() 376 for i := start; i < sz; i++ { 377 offset := int32(b.findOffset(i) - delta) 378 out[i-start] = offset 379 } 380 381 out[sz-start] = int32(b.builder.DataLen() - (int(delta) - int(first))) 382 } 383 384 // CopyValues copies the raw binary data bytes out, out should be a []byte 385 // with at least ValuesSize bytes allocated to copy into. 
386 func (b *BinaryMemoTable) CopyValues(out interface{}) { 387 b.CopyValuesSubset(0, out) 388 } 389 390 // CopyValuesSubset copies the raw binary data bytes out starting with the value 391 // at the index start, out should be a []byte with at least ValuesSize bytes allocated 392 func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) { 393 if b.builder.Len() <= start { 394 return 395 } 396 397 var ( 398 first = b.findOffset(0) 399 offset = b.findOffset(int(start)) 400 length = b.builder.DataLen() - int(offset-first) 401 ) 402 403 outval := out.([]byte) 404 copy(outval, b.builder.Value(start)[0:length]) 405 } 406 407 func (b *BinaryMemoTable) WriteOut(out []byte) { 408 b.CopyValues(out) 409 } 410 411 func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) { 412 b.CopyValuesSubset(start, out) 413 } 414 415 // CopyFixedWidthValues exists to cope with the fact that the table doesn't keep 416 // track of the fixed width when inserting the null value the databuffer holds a 417 // zero length byte slice for the null value (if found) 418 func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) { 419 if start >= b.Size() { 420 return 421 } 422 423 null, exists := b.GetNull() 424 if !exists || null < start { 425 // nothing to skip, proceed as usual 426 b.CopyValuesSubset(start, out) 427 return 428 } 429 430 var ( 431 leftOffset = b.findOffset(start) 432 nullOffset = b.findOffset(null) 433 leftSize = nullOffset - leftOffset 434 ) 435 436 if leftSize > 0 { 437 copy(out, b.builder.Value(start)[0:leftSize]) 438 } 439 440 rightSize := b.ValuesSize() - int(nullOffset) 441 if rightSize > 0 { 442 // skip the null fixed size value 443 copy(out[int(leftSize)+width:], b.builder.Value(int(nullOffset))[0:rightSize]) 444 } 445 } 446 447 // VisitValues exists to run the visitFn on each value currently in the hash table. 
448 func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) { 449 for i := int(start); i < b.Size(); i++ { 450 visitFn(b.builder.Value(i)) 451 } 452 } 453 454 // Release is used to tell the underlying builder that it can release the memory allocated 455 // when the reference count reaches 0, this is safe to be called from multiple goroutines 456 // simultaneously 457 func (b *BinaryMemoTable) Release() { b.builder.Release() } 458 459 // Retain increases the ref count, it is safe to call it from multiple goroutines 460 // simultaneously. 461 func (b *BinaryMemoTable) Retain() { b.builder.Retain() } 462 463 // ValuesSize returns the current total size of all the raw bytes that have been inserted 464 // into the memotable so far. 465 func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() }