github.com/apache/arrow/go/v14@v14.0.1/internal/hashing/xxh3_memo_table.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 // Package hashing provides utilities for and an implementation of a hash 18 // table which is more performant than the default go map implementation 19 // by leveraging xxh3 and some custom hash functions. 20 package hashing 21 22 import ( 23 "bytes" 24 "math" 25 "reflect" 26 "unsafe" 27 ) 28 29 //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl 30 31 type TypeTraits interface { 32 BytesRequired(n int) int 33 } 34 35 type ByteSlice interface { 36 Bytes() []byte 37 } 38 39 // MemoTable interface for hash tables and dictionary encoding. 40 // 41 // Values will remember the order they are inserted to generate a valid 42 // dictionary. 43 type MemoTable interface { 44 TypeTraits() TypeTraits 45 // Reset drops everything in the table allowing it to be reused 46 Reset() 47 // Size returns the current number of unique values stored in 48 // the table, including whether or not a null value has been 49 // inserted via GetOrInsertNull. 50 Size() int 51 // GetOrInsert returns the index of the table the specified value is, 52 // and a boolean indicating whether or not the value was found in 53 // the table (if false, the value was inserted). An error is returned 54 // if val is not the appropriate type for the table. 55 GetOrInsert(val interface{}) (idx int, existed bool, err error) 56 // GetOrInsertBytes returns the index of the table the specified value is, 57 // and a boolean indicating whether or not the value was found in 58 // the table (if false, the value was inserted). An error is returned 59 // if val is not the appropriate type for the table. This function is intended to be used by 60 // the BinaryMemoTable to prevent uncessary allocations of the data when converting from a []byte to interface{}. 61 GetOrInsertBytes(val []byte) (idx int, existed bool, err error) 62 // GetOrInsertNull returns the index of the null value in the table, 63 // inserting one if it hasn't already been inserted. It returns a boolean 64 // indicating if the null value already existed or not in the table. 65 GetOrInsertNull() (idx int, existed bool) 66 // GetNull returns the index of the null value in the table, but does not 67 // insert one if it doesn't already exist. Will return -1 if it doesn't exist 68 // indicated by a false value for the boolean. 69 GetNull() (idx int, exists bool) 70 // WriteOut copys the unique values of the memotable out to the byte slice 71 // provided. Must have allocated enough bytes for all the values. 72 WriteOut(out []byte) 73 // WriteOutSubset is like WriteOut, but only writes a subset of values 74 // starting with the index offset. 75 WriteOutSubset(offset int, out []byte) 76 } 77 78 type NumericMemoTable interface { 79 MemoTable 80 WriteOutLE(out []byte) 81 WriteOutSubsetLE(offset int, out []byte) 82 } 83 84 const ( 85 sentinel uint64 = 0 86 loadFactor int64 = 2 87 ) 88 89 func max(a, b uint64) uint64 { 90 if a > b { 91 return a 92 } 93 return b 94 } 95 96 var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) } 97 98 // KeyNotFound is the constant returned by memo table functions when a key isn't found in the table 99 const KeyNotFound = -1 100 101 type BinaryBuilderIFace interface { 102 Reserve(int) 103 ReserveData(int) 104 Retain() 105 Resize(int) 106 ResizeData(int) 107 Release() 108 DataLen() int 109 Value(int) []byte 110 Len() int 111 AppendNull() 112 AppendString(string) 113 Append([]byte) 114 } 115 116 // BinaryMemoTable is our hashtable for binary data using the BinaryBuilder 117 // to construct the actual data in an easy to pass around way with minimal copies 118 // while using a hash table to keep track of the indexes into the dictionary that 119 // is created as we go. 120 type BinaryMemoTable struct { 121 tbl *Int32HashTable 122 builder BinaryBuilderIFace 123 nullIdx int 124 } 125 126 // NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will 127 // be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used. 128 // initial and valuesize can be used to pre-allocate the table to reduce allocations. With 129 // initial being the initial number of entries to allocate for and valuesize being the starting 130 // amount of space allocated for writing the actual binary data. 131 func NewBinaryMemoTable(initial, valuesize int, bldr BinaryBuilderIFace) *BinaryMemoTable { 132 bldr.Reserve(int(initial)) 133 datasize := valuesize 134 if datasize <= 0 { 135 datasize = initial * 4 136 } 137 bldr.ReserveData(datasize) 138 return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound} 139 } 140 141 type unimplementedtraits struct{} 142 143 func (unimplementedtraits) BytesRequired(int) int { panic("unimplemented") } 144 145 func (BinaryMemoTable) TypeTraits() TypeTraits { 146 return unimplementedtraits{} 147 } 148 149 // Reset dumps all of the data in the table allowing it to be reutilized. 150 func (s *BinaryMemoTable) Reset() { 151 s.tbl.Reset(32) 152 s.builder.Resize(0) 153 s.builder.ResizeData(0) 154 s.builder.Reserve(int(32)) 155 s.builder.ReserveData(int(32) * 4) 156 s.nullIdx = KeyNotFound 157 } 158 159 // GetNull returns the index of a null that has been inserted into the table or 160 // KeyNotFound. The bool returned will be true if there was a null inserted into 161 // the table, and false otherwise. 162 func (s *BinaryMemoTable) GetNull() (int, bool) { 163 return int(s.nullIdx), s.nullIdx != KeyNotFound 164 } 165 166 // Size returns the current size of the memo table including the null value 167 // if one has been inserted. 168 func (s *BinaryMemoTable) Size() int { 169 sz := int(s.tbl.size) 170 if _, ok := s.GetNull(); ok { 171 sz++ 172 } 173 return sz 174 } 175 176 // helper function to easily return a byte slice for any given value 177 // regardless of the type if it's a []byte, string, or fulfills the 178 // ByteSlice interface. 179 func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { 180 switch v := val.(type) { 181 case []byte: 182 return v 183 case ByteSlice: 184 return v.Bytes() 185 case string: 186 var out []byte 187 h := (*reflect.StringHeader)(unsafe.Pointer(&v)) 188 s := (*reflect.SliceHeader)(unsafe.Pointer(&out)) 189 s.Data = h.Data 190 s.Len = h.Len 191 s.Cap = h.Len 192 return out 193 default: 194 panic("invalid type for binarymemotable") 195 } 196 } 197 198 // helper function to get the hash value regardless of the underlying binary type 199 func (BinaryMemoTable) getHash(val interface{}) uint64 { 200 switch v := val.(type) { 201 case string: 202 return hashString(v, 0) 203 case []byte: 204 return Hash(v, 0) 205 case ByteSlice: 206 return Hash(v.Bytes(), 0) 207 default: 208 panic("invalid type for binarymemotable") 209 } 210 } 211 212 // helper function to append the given value to the builder regardless 213 // of the underlying binary type. 214 func (b *BinaryMemoTable) appendVal(val interface{}) { 215 switch v := val.(type) { 216 case string: 217 b.builder.AppendString(v) 218 case []byte: 219 b.builder.Append(v) 220 case ByteSlice: 221 b.builder.Append(v.Bytes()) 222 } 223 } 224 225 func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) { 226 return b.tbl.Lookup(h, func(i int32) bool { 227 return bytes.Equal(val, b.builder.Value(int(i))) 228 }) 229 } 230 231 // Get returns the index of the specified value in the table or KeyNotFound, 232 // and a boolean indicating whether it was found in the table. 233 func (b *BinaryMemoTable) Get(val interface{}) (int, bool) { 234 if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok { 235 return int(p.payload.val), ok 236 } 237 return KeyNotFound, false 238 } 239 240 // GetOrInsertBytes returns the index of the given value in the table, if not found 241 // it is inserted into the table. The return value 'found' indicates whether the value 242 // was found in the table (true) or inserted (false) along with any possible error. 243 func (b *BinaryMemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { 244 h := Hash(val, 0) 245 p, found := b.lookup(h, val) 246 if found { 247 idx = int(p.payload.val) 248 } else { 249 idx = b.Size() 250 b.builder.Append(val) 251 b.tbl.Insert(p, h, int32(idx), -1) 252 } 253 return 254 } 255 256 // GetOrInsert returns the index of the given value in the table, if not found 257 // it is inserted into the table. The return value 'found' indicates whether the value 258 // was found in the table (true) or inserted (false) along with any possible error. 259 func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { 260 h := b.getHash(val) 261 p, found := b.lookup(h, b.valAsByteSlice(val)) 262 if found { 263 idx = int(p.payload.val) 264 } else { 265 idx = b.Size() 266 b.appendVal(val) 267 b.tbl.Insert(p, h, int32(idx), -1) 268 } 269 return 270 } 271 272 // GetOrInsertNull retrieves the index of a null in the table or inserts 273 // null into the table, returning the index and a boolean indicating if it was 274 // found in the table (true) or was inserted (false). 275 func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) { 276 idx, found = b.GetNull() 277 if !found { 278 idx = b.Size() 279 b.nullIdx = idx 280 b.builder.AppendNull() 281 } 282 return 283 } 284 285 func (b *BinaryMemoTable) Value(i int) []byte { 286 return b.builder.Value(i) 287 } 288 289 // helper function to get the offset into the builder data for a given 290 // index value. 291 func (b *BinaryMemoTable) findOffset(idx int) uintptr { 292 if b.builder.DataLen() == 0 { 293 // only empty strings, short circuit 294 return 0 295 } 296 297 val := b.builder.Value(idx) 298 for len(val) == 0 { 299 idx++ 300 if idx >= b.builder.Len() { 301 break 302 } 303 val = b.builder.Value(idx) 304 } 305 if len(val) != 0 { 306 return uintptr(unsafe.Pointer(&val[0])) 307 } 308 return uintptr(b.builder.DataLen()) + b.findOffset(0) 309 } 310 311 // CopyOffsets copies the list of offsets into the passed in slice, the offsets 312 // being the start and end values of the underlying allocated bytes in the builder 313 // for the individual values of the table. out should be at least sized to Size()+1 314 func (b *BinaryMemoTable) CopyOffsets(out []int32) { 315 b.CopyOffsetsSubset(0, out) 316 } 317 318 // CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, 319 // it gets a subset of the offsets in the table starting at the index provided by "start". 320 func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int32) { 321 if b.builder.Len() <= start { 322 return 323 } 324 325 first := b.findOffset(0) 326 delta := b.findOffset(start) 327 sz := b.Size() 328 for i := start; i < sz; i++ { 329 offset := int32(b.findOffset(i) - delta) 330 out[i-start] = offset 331 } 332 333 out[sz-start] = int32(b.builder.DataLen() - (int(delta) - int(first))) 334 } 335 336 // CopyLargeOffsets copies the list of offsets into the passed in slice, the offsets 337 // being the start and end values of the underlying allocated bytes in the builder 338 // for the individual values of the table. out should be at least sized to Size()+1 339 func (b *BinaryMemoTable) CopyLargeOffsets(out []int64) { 340 b.CopyLargeOffsetsSubset(0, out) 341 } 342 343 // CopyLargeOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, 344 // it gets a subset of the offsets in the table starting at the index provided by "start". 345 func (b *BinaryMemoTable) CopyLargeOffsetsSubset(start int, out []int64) { 346 if b.builder.Len() <= start { 347 return 348 } 349 350 first := b.findOffset(0) 351 delta := b.findOffset(start) 352 sz := b.Size() 353 for i := start; i < sz; i++ { 354 offset := int64(b.findOffset(i) - delta) 355 out[i-start] = offset 356 } 357 358 out[sz-start] = int64(b.builder.DataLen() - (int(delta) - int(first))) 359 } 360 361 // CopyValues copies the raw binary data bytes out, out should be a []byte 362 // with at least ValuesSize bytes allocated to copy into. 363 func (b *BinaryMemoTable) CopyValues(out interface{}) { 364 b.CopyValuesSubset(0, out) 365 } 366 367 // CopyValuesSubset copies the raw binary data bytes out starting with the value 368 // at the index start, out should be a []byte with at least ValuesSize bytes allocated 369 func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) { 370 if b.builder.Len() <= start { 371 return 372 } 373 374 var ( 375 first = b.findOffset(0) 376 offset = b.findOffset(int(start)) 377 length = b.builder.DataLen() - int(offset-first) 378 ) 379 380 outval := out.([]byte) 381 copy(outval, b.builder.Value(start)[0:length]) 382 } 383 384 func (b *BinaryMemoTable) WriteOut(out []byte) { 385 b.CopyValues(out) 386 } 387 388 func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) { 389 b.CopyValuesSubset(start, out) 390 } 391 392 // CopyFixedWidthValues exists to cope with the fact that the table doesn't keep 393 // track of the fixed width when inserting the null value the databuffer holds a 394 // zero length byte slice for the null value (if found) 395 func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) { 396 if start >= b.Size() { 397 return 398 } 399 400 null, exists := b.GetNull() 401 if !exists || null < start { 402 // nothing to skip, proceed as usual 403 b.CopyValuesSubset(start, out) 404 return 405 } 406 407 var ( 408 leftOffset = b.findOffset(start) 409 nullOffset = b.findOffset(null) 410 leftSize = nullOffset - leftOffset 411 rightOffset = leftOffset + uintptr(b.ValuesSize()) 412 ) 413 414 if leftSize > 0 { 415 copy(out, b.builder.Value(start)[0:leftSize]) 416 } 417 418 rightSize := rightOffset - nullOffset 419 if rightSize > 0 { 420 // skip the null fixed size value 421 copy(out[int(leftSize)+width:], b.builder.Value(null + 1)[0:rightSize]) 422 } 423 } 424 425 // VisitValues exists to run the visitFn on each value currently in the hash table. 426 func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) { 427 for i := int(start); i < b.Size(); i++ { 428 visitFn(b.builder.Value(i)) 429 } 430 } 431 432 // Release is used to tell the underlying builder that it can release the memory allocated 433 // when the reference count reaches 0, this is safe to be called from multiple goroutines 434 // simultaneously 435 func (b *BinaryMemoTable) Release() { b.builder.Release() } 436 437 // Retain increases the ref count, it is safe to call it from multiple goroutines 438 // simultaneously. 439 func (b *BinaryMemoTable) Retain() { b.builder.Retain() } 440 441 // ValuesSize returns the current total size of all the raw bytes that have been inserted 442 // into the memotable so far. 443 func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() }