// Source: github.com/apache/arrow/go/v7@v7.0.1/parquet/internal/hashing/xxh3_memo_table.go

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package hashing provides utilities for and an implementation of a hash
// table which is more performant than the default go map implementation
// by leveraging xxh3 and some custom hash functions.
20 package hashing 21 22 import ( 23 "bytes" 24 "math" 25 "math/bits" 26 "reflect" 27 "unsafe" 28 29 "github.com/apache/arrow/go/v7/arrow" 30 "github.com/apache/arrow/go/v7/arrow/array" 31 "github.com/apache/arrow/go/v7/arrow/memory" 32 "github.com/apache/arrow/go/v7/parquet" 33 34 "github.com/zeebo/xxh3" 35 ) 36 37 //go:generate go run ../../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl 38 39 func hashInt(val uint64, alg uint64) uint64 { 40 // Two of xxhash's prime multipliers (which are chosen for their 41 // bit dispersion properties) 42 var multipliers = [2]uint64{11400714785074694791, 14029467366897019727} 43 // Multiplying by the prime number mixes the low bits into the high bits, 44 // then byte-swapping (which is a single CPU instruction) allows the 45 // combined high and low bits to participate in the initial hash table index. 46 return bits.ReverseBytes64(multipliers[alg] * val) 47 } 48 49 func hashFloat32(val float32, alg uint64) uint64 { 50 // grab the raw byte pattern of the 51 bt := *(*[4]byte)(unsafe.Pointer(&val)) 52 x := uint64(*(*uint32)(unsafe.Pointer(&bt[0]))) 53 hx := hashInt(x, alg) 54 hy := hashInt(x, alg^1) 55 return 4 ^ hx ^ hy 56 } 57 58 func hashFloat64(val float64, alg uint64) uint64 { 59 bt := *(*[8]byte)(unsafe.Pointer(&val)) 60 hx := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[4]))), alg) 61 hy := hashInt(uint64(*(*uint32)(unsafe.Pointer(&bt[0]))), alg^1) 62 return 8 ^ hx ^ hy 63 } 64 65 func hashString(val string, alg uint64) uint64 { 66 buf := *(*[]byte)(unsafe.Pointer(&val)) 67 (*reflect.SliceHeader)(unsafe.Pointer(&buf)).Cap = len(val) 68 return hash(buf, alg) 69 } 70 71 // prime constants used for slightly increasing the hash quality further 72 var exprimes = [2]uint64{1609587929392839161, 9650029242287828579} 73 74 // for smaller amounts of bytes this is faster than even calling into 75 // xxh3 to do the hash, so we specialize in order to get the benefits 76 // of that performance. 
77 func hash(b []byte, alg uint64) uint64 { 78 n := uint32(len(b)) 79 if n <= 16 { 80 switch { 81 case n > 8: 82 // 8 < length <= 16 83 // apply same principle as above, but as two 64-bit ints 84 x := *(*uint64)(unsafe.Pointer(&b[n-8])) 85 y := *(*uint64)(unsafe.Pointer(&b[0])) 86 hx := hashInt(x, alg) 87 hy := hashInt(y, alg^1) 88 return uint64(n) ^ hx ^ hy 89 case n >= 4: 90 // 4 < length <= 8 91 // we can read the bytes as two overlapping 32-bit ints, apply different 92 // hash functions to each in parallel 93 // then xor the results 94 x := *(*uint32)(unsafe.Pointer(&b[n-4])) 95 y := *(*uint32)(unsafe.Pointer(&b[0])) 96 hx := hashInt(uint64(x), alg) 97 hy := hashInt(uint64(y), alg^1) 98 return uint64(n) ^ hx ^ hy 99 case n > 0: 100 x := uint32((n << 24) ^ (uint32(b[0]) << 16) ^ (uint32(b[n/2]) << 8) ^ uint32(b[n-1])) 101 return hashInt(uint64(x), alg) 102 case n == 0: 103 return 1 104 } 105 } 106 107 // increase differentiation enough to improve hash quality 108 return xxh3.Hash(b) + exprimes[alg] 109 } 110 111 const ( 112 sentinel uint64 = 0 113 loadFactor int64 = 2 114 ) 115 116 func max(a, b uint64) uint64 { 117 if a > b { 118 return a 119 } 120 return b 121 } 122 123 var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) } 124 125 // KeyNotFound is the constant returned by memo table functions when a key isn't found in the table 126 const KeyNotFound = -1 127 128 // BinaryMemoTable is our hashtable for binary data using the BinaryBuilder 129 // to construct the actual data in an easy to pass around way with minimal copies 130 // while using a hash table to keep track of the indexes into the dictionary that 131 // is created as we go. 132 type BinaryMemoTable struct { 133 tbl *Int32HashTable 134 builder *array.BinaryBuilder 135 nullIdx int 136 } 137 138 // NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will 139 // be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used. 
140 // initial and valuesize can be used to pre-allocate the table to reduce allocations. With 141 // initial being the initial number of entries to allocate for and valuesize being the starting 142 // amount of space allocated for writing the actual binary data. 143 func NewBinaryMemoTable(mem memory.Allocator, initial, valuesize int) *BinaryMemoTable { 144 if mem == nil { 145 mem = memory.DefaultAllocator 146 } 147 bldr := array.NewBinaryBuilder(mem, arrow.BinaryTypes.Binary) 148 bldr.Reserve(int(initial)) 149 datasize := valuesize 150 if datasize <= 0 { 151 datasize = initial * 4 152 } 153 bldr.ReserveData(datasize) 154 return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound} 155 } 156 157 // Reset dumps all of the data in the table allowing it to be reutilized. 158 func (s *BinaryMemoTable) Reset() { 159 s.tbl.Reset(32) 160 s.builder.NewArray().Release() 161 s.builder.Reserve(int(32)) 162 s.builder.ReserveData(int(32) * 4) 163 s.nullIdx = KeyNotFound 164 } 165 166 // GetNull returns the index of a null that has been inserted into the table or 167 // KeyNotFound. The bool returned will be true if there was a null inserted into 168 // the table, and false otherwise. 169 func (s *BinaryMemoTable) GetNull() (int, bool) { 170 return int(s.nullIdx), s.nullIdx != KeyNotFound 171 } 172 173 // Size returns the current size of the memo table including the null value 174 // if one has been inserted. 175 func (s *BinaryMemoTable) Size() int { 176 sz := int(s.tbl.size) 177 if _, ok := s.GetNull(); ok { 178 sz++ 179 } 180 return sz 181 } 182 183 // helper function to easily return a byte slice for any given value 184 // regardless of the type if it's a []byte, parquet.ByteArray, 185 // parquet.FixedLenByteArray or string. 
186 func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { 187 switch v := val.(type) { 188 case []byte: 189 return v 190 case parquet.ByteArray: 191 return *(*[]byte)(unsafe.Pointer(&v)) 192 case parquet.FixedLenByteArray: 193 return *(*[]byte)(unsafe.Pointer(&v)) 194 case string: 195 var out []byte 196 h := (*reflect.StringHeader)(unsafe.Pointer(&v)) 197 s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) 198 s.Data = h.Data 199 s.Len = h.Len 200 s.Cap = h.Len 201 return out 202 default: 203 panic("invalid type for binarymemotable") 204 } 205 } 206 207 // helper function to get the hash value regardless of the underlying binary type 208 func (BinaryMemoTable) getHash(val interface{}) uint64 { 209 switch v := val.(type) { 210 case string: 211 return hashString(v, 0) 212 case []byte: 213 return hash(v, 0) 214 case parquet.ByteArray: 215 return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) 216 case parquet.FixedLenByteArray: 217 return hash(*(*[]byte)(unsafe.Pointer(&v)), 0) 218 default: 219 panic("invalid type for binarymemotable") 220 } 221 } 222 223 // helper function to append the given value to the builder regardless 224 // of the underlying binary type. 225 func (b *BinaryMemoTable) appendVal(val interface{}) { 226 switch v := val.(type) { 227 case string: 228 b.builder.AppendString(v) 229 case []byte: 230 b.builder.Append(v) 231 case parquet.ByteArray: 232 b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) 233 case parquet.FixedLenByteArray: 234 b.builder.Append(*(*[]byte)(unsafe.Pointer(&v))) 235 } 236 } 237 238 func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) { 239 return b.tbl.Lookup(h, func(i int32) bool { 240 return bytes.Equal(val, b.builder.Value(int(i))) 241 }) 242 } 243 244 // Get returns the index of the specified value in the table or KeyNotFound, 245 // and a boolean indicating whether it was found in the table. 
246 func (b *BinaryMemoTable) Get(val interface{}) (int, bool) { 247 if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok { 248 return int(p.payload.val), ok 249 } 250 return KeyNotFound, false 251 } 252 253 // GetOrInsert returns the index of the given value in the table, if not found 254 // it is inserted into the table. The return value 'found' indicates whether the value 255 // was found in the table (true) or inserted (false) along with any possible error. 256 func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { 257 h := b.getHash(val) 258 p, found := b.lookup(h, b.valAsByteSlice(val)) 259 if found { 260 idx = int(p.payload.val) 261 } else { 262 idx = b.Size() 263 b.appendVal(val) 264 b.tbl.Insert(p, h, int32(idx), -1) 265 } 266 return 267 } 268 269 // GetOrInsertNull retrieves the index of a null in the table or inserts 270 // null into the table, returning the index and a boolean indicating if it was 271 // found in the table (true) or was inserted (false). 272 func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) { 273 idx, found = b.GetNull() 274 if !found { 275 idx = b.Size() 276 b.nullIdx = idx 277 b.builder.AppendNull() 278 } 279 return 280 } 281 282 // helper function to get the offset into the builder data for a given 283 // index value. 284 func (b *BinaryMemoTable) findOffset(idx int) uintptr { 285 val := b.builder.Value(idx) 286 for len(val) == 0 { 287 idx++ 288 if idx >= b.builder.Len() { 289 break 290 } 291 val = b.builder.Value(idx) 292 } 293 if len(val) != 0 { 294 return uintptr(unsafe.Pointer(&val[0])) 295 } 296 return uintptr(b.builder.DataLen()) + b.findOffset(0) 297 } 298 299 // CopyOffsets copies the list of offsets into the passed in slice, the offsets 300 // being the start and end values of the underlying allocated bytes in the builder 301 // for the individual values of the table. 
out should be at least sized to Size()+1 302 func (b *BinaryMemoTable) CopyOffsets(out []int8) { 303 b.CopyOffsetsSubset(0, out) 304 } 305 306 // CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, 307 // it gets a subset of the offsets in the table starting at the index provided by "start". 308 func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int8) { 309 if b.builder.Len() <= start { 310 return 311 } 312 313 first := b.findOffset(0) 314 delta := b.findOffset(start) 315 for i := start; i < b.Size(); i++ { 316 offset := int8(b.findOffset(i) - delta) 317 out[i-start] = offset 318 } 319 320 out[b.Size()-start] = int8(b.builder.DataLen() - int(delta) - int(first)) 321 } 322 323 // CopyValues copies the raw binary data bytes out, out should be a []byte 324 // with at least ValuesSize bytes allocated to copy into. 325 func (b *BinaryMemoTable) CopyValues(out interface{}) { 326 b.CopyValuesSubset(0, out) 327 } 328 329 // CopyValuesSubset copies the raw binary data bytes out starting with the value 330 // at the index start, out should be a []byte with at least ValuesSize bytes allocated 331 func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) { 332 var ( 333 first = b.findOffset(0) 334 offset = b.findOffset(int(start)) 335 length = b.builder.DataLen() - int(offset-first) 336 ) 337 338 outval := out.([]byte) 339 copy(outval, b.builder.Value(start)[0:length]) 340 } 341 342 func (b *BinaryMemoTable) WriteOut(out []byte) { 343 b.CopyValues(out) 344 } 345 346 func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) { 347 b.CopyValuesSubset(start, out) 348 } 349 350 // CopyFixedWidthValues exists to cope with the fact that the table doesn't keep 351 // track of the fixed width when inserting the null value the databuffer holds a 352 // zero length byte slice for the null value (if found) 353 func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) { 354 if start >= b.Size() { 355 return 356 } 
357 358 null, exists := b.GetNull() 359 if !exists || null < start { 360 // nothing to skip, proceed as usual 361 b.CopyValuesSubset(start, out) 362 return 363 } 364 365 var ( 366 leftOffset = b.findOffset(start) 367 nullOffset = b.findOffset(null) 368 leftSize = nullOffset - leftOffset 369 ) 370 371 if leftSize > 0 { 372 copy(out, b.builder.Value(start)[0:leftSize]) 373 } 374 375 rightSize := b.ValuesSize() - int(nullOffset) 376 if rightSize > 0 { 377 // skip the null fixed size value 378 copy(out[int(leftSize)+width:], b.builder.Value(int(nullOffset))[0:rightSize]) 379 } 380 } 381 382 // VisitValues exists to run the visitFn on each value currently in the hash table. 383 func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) { 384 for i := int(start); i < b.Size(); i++ { 385 visitFn(b.builder.Value(i)) 386 } 387 } 388 389 // Release is used to tell the underlying builder that it can release the memory allocated 390 // when the reference count reaches 0, this is safe to be called from multiple goroutines 391 // simultaneously 392 func (b *BinaryMemoTable) Release() { b.builder.Release() } 393 394 // Retain increases the ref count, it is safe to call it from multiple goroutines 395 // simultaneously. 396 func (b *BinaryMemoTable) Retain() { b.builder.Retain() } 397 398 // ValuesSize returns the current total size of all the raw bytes that have been inserted 399 // into the memotable so far. 400 func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() }