github.com/apache/arrow/go/v16@v16.1.0/internal/hashing/xxh3_memo_table.go (about) 1 // Licensed to the Apache Software Foundation (ASF) under one 2 // or more contributor license agreements. See the NOTICE file 3 // distributed with this work for additional information 4 // regarding copyright ownership. The ASF licenses this file 5 // to you under the Apache License, Version 2.0 (the 6 // "License"); you may not use this file except in compliance 7 // with the License. You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 // Package hashing provides utilities for and an implementation of a hash 18 // table which is more performant than the default go map implementation 19 // by leveraging xxh3 and some custom hash functions. 20 package hashing 21 22 import ( 23 "bytes" 24 "math" 25 "unsafe" 26 ) 27 28 //go:generate go run ../../arrow/_tools/tmpl/main.go -i -data=types.tmpldata xxh3_memo_table.gen.go.tmpl 29 30 type TypeTraits interface { 31 BytesRequired(n int) int 32 } 33 34 type ByteSlice interface { 35 Bytes() []byte 36 } 37 38 // MemoTable interface for hash tables and dictionary encoding. 39 // 40 // Values will remember the order they are inserted to generate a valid 41 // dictionary. 42 type MemoTable interface { 43 TypeTraits() TypeTraits 44 // Reset drops everything in the table allowing it to be reused 45 Reset() 46 // Size returns the current number of unique values stored in 47 // the table, including whether or not a null value has been 48 // inserted via GetOrInsertNull. 49 Size() int 50 // GetOrInsert returns the index of the table the specified value is, 51 // and a boolean indicating whether or not the value was found in 52 // the table (if false, the value was inserted). An error is returned 53 // if val is not the appropriate type for the table. 54 GetOrInsert(val interface{}) (idx int, existed bool, err error) 55 // GetOrInsertBytes returns the index of the table the specified value is, 56 // and a boolean indicating whether or not the value was found in 57 // the table (if false, the value was inserted). An error is returned 58 // if val is not the appropriate type for the table. This function is intended to be used by 59 // the BinaryMemoTable to prevent unnecessary allocations of the data when converting from a []byte to interface{}. 60 GetOrInsertBytes(val []byte) (idx int, existed bool, err error) 61 // GetOrInsertNull returns the index of the null value in the table, 62 // inserting one if it hasn't already been inserted. It returns a boolean 63 // indicating if the null value already existed or not in the table. 64 GetOrInsertNull() (idx int, existed bool) 65 // GetNull returns the index of the null value in the table, but does not 66 // insert one if it doesn't already exist. Will return -1 if it doesn't exist 67 // indicated by a false value for the boolean. 68 GetNull() (idx int, exists bool) 69 // WriteOut copies the unique values of the memotable out to the byte slice 70 // provided. Must have allocated enough bytes for all the values. 71 WriteOut(out []byte) 72 // WriteOutSubset is like WriteOut, but only writes a subset of values 73 // starting with the index offset. 74 WriteOutSubset(offset int, out []byte) 75 } 76 77 type NumericMemoTable interface { 78 MemoTable 79 WriteOutLE(out []byte) 80 WriteOutSubsetLE(offset int, out []byte) 81 } 82 83 const ( 84 sentinel uint64 = 0 85 loadFactor int64 = 2 86 ) 87 88 func max(a, b uint64) uint64 { 89 if a > b { 90 return a 91 } 92 return b 93 } 94 95 var isNan32Cmp = func(v float32) bool { return math.IsNaN(float64(v)) } 96 97 // KeyNotFound is the constant returned by memo table functions when a key isn't found in the table 98 const KeyNotFound = -1 99 100 type BinaryBuilderIFace interface { 101 Reserve(int) 102 ReserveData(int) 103 Retain() 104 Resize(int) 105 ResizeData(int) 106 Release() 107 DataLen() int 108 Value(int) []byte 109 Len() int 110 AppendNull() 111 AppendString(string) 112 Append([]byte) 113 } 114 115 // BinaryMemoTable is our hashtable for binary data using the BinaryBuilder 116 // to construct the actual data in an easy to pass around way with minimal copies 117 // while using a hash table to keep track of the indexes into the dictionary that 118 // is created as we go. 119 type BinaryMemoTable struct { 120 tbl *Int32HashTable 121 builder BinaryBuilderIFace 122 nullIdx int 123 } 124 125 // NewBinaryMemoTable returns a hash table for Binary data, the passed in allocator will 126 // be utilized for the BinaryBuilder, if nil then memory.DefaultAllocator will be used. 127 // initial and valuesize can be used to pre-allocate the table to reduce allocations. With 128 // initial being the initial number of entries to allocate for and valuesize being the starting 129 // amount of space allocated for writing the actual binary data. 130 func NewBinaryMemoTable(initial, valuesize int, bldr BinaryBuilderIFace) *BinaryMemoTable { 131 bldr.Reserve(int(initial)) 132 datasize := valuesize 133 if datasize <= 0 { 134 datasize = initial * 4 135 } 136 bldr.ReserveData(datasize) 137 return &BinaryMemoTable{tbl: NewInt32HashTable(uint64(initial)), builder: bldr, nullIdx: KeyNotFound} 138 } 139 140 type unimplementedtraits struct{} 141 142 func (unimplementedtraits) BytesRequired(int) int { panic("unimplemented") } 143 144 func (BinaryMemoTable) TypeTraits() TypeTraits { 145 return unimplementedtraits{} 146 } 147 148 // Reset dumps all of the data in the table allowing it to be reutilized. 149 func (s *BinaryMemoTable) Reset() { 150 s.tbl.Reset(32) 151 s.builder.Resize(0) 152 s.builder.ResizeData(0) 153 s.builder.Reserve(int(32)) 154 s.builder.ReserveData(int(32) * 4) 155 s.nullIdx = KeyNotFound 156 } 157 158 // GetNull returns the index of a null that has been inserted into the table or 159 // KeyNotFound. The bool returned will be true if there was a null inserted into 160 // the table, and false otherwise. 161 func (s *BinaryMemoTable) GetNull() (int, bool) { 162 return int(s.nullIdx), s.nullIdx != KeyNotFound 163 } 164 165 // Size returns the current size of the memo table including the null value 166 // if one has been inserted. 167 func (s *BinaryMemoTable) Size() int { 168 sz := int(s.tbl.size) 169 if _, ok := s.GetNull(); ok { 170 sz++ 171 } 172 return sz 173 } 174 175 // helper function to easily return a byte slice for any given value 176 // regardless of the type if it's a []byte, string, or fulfills the 177 // ByteSlice interface. 178 func (BinaryMemoTable) valAsByteSlice(val interface{}) []byte { 179 switch v := val.(type) { 180 case []byte: 181 return v 182 case ByteSlice: 183 return v.Bytes() 184 case string: 185 return strToBytes(v) 186 default: 187 panic("invalid type for binarymemotable") 188 } 189 } 190 191 // helper function to get the hash value regardless of the underlying binary type 192 func (BinaryMemoTable) getHash(val interface{}) uint64 { 193 switch v := val.(type) { 194 case string: 195 return hashString(v, 0) 196 case []byte: 197 return Hash(v, 0) 198 case ByteSlice: 199 return Hash(v.Bytes(), 0) 200 default: 201 panic("invalid type for binarymemotable") 202 } 203 } 204 205 // helper function to append the given value to the builder regardless 206 // of the underlying binary type. 207 func (b *BinaryMemoTable) appendVal(val interface{}) { 208 switch v := val.(type) { 209 case string: 210 b.builder.AppendString(v) 211 case []byte: 212 b.builder.Append(v) 213 case ByteSlice: 214 b.builder.Append(v.Bytes()) 215 } 216 } 217 218 func (b *BinaryMemoTable) lookup(h uint64, val []byte) (*entryInt32, bool) { 219 return b.tbl.Lookup(h, func(i int32) bool { 220 return bytes.Equal(val, b.builder.Value(int(i))) 221 }) 222 } 223 224 // Get returns the index of the specified value in the table or KeyNotFound, 225 // and a boolean indicating whether it was found in the table. 226 func (b *BinaryMemoTable) Get(val interface{}) (int, bool) { 227 if p, ok := b.lookup(b.getHash(val), b.valAsByteSlice(val)); ok { 228 return int(p.payload.val), ok 229 } 230 return KeyNotFound, false 231 } 232 233 // GetOrInsertBytes returns the index of the given value in the table, if not found 234 // it is inserted into the table. The return value 'found' indicates whether the value 235 // was found in the table (true) or inserted (false) along with any possible error. 236 func (b *BinaryMemoTable) GetOrInsertBytes(val []byte) (idx int, found bool, err error) { 237 h := Hash(val, 0) 238 p, found := b.lookup(h, val) 239 if found { 240 idx = int(p.payload.val) 241 } else { 242 idx = b.Size() 243 b.builder.Append(val) 244 b.tbl.Insert(p, h, int32(idx), -1) 245 } 246 return 247 } 248 249 // GetOrInsert returns the index of the given value in the table, if not found 250 // it is inserted into the table. The return value 'found' indicates whether the value 251 // was found in the table (true) or inserted (false) along with any possible error. 252 func (b *BinaryMemoTable) GetOrInsert(val interface{}) (idx int, found bool, err error) { 253 h := b.getHash(val) 254 p, found := b.lookup(h, b.valAsByteSlice(val)) 255 if found { 256 idx = int(p.payload.val) 257 } else { 258 idx = b.Size() 259 b.appendVal(val) 260 b.tbl.Insert(p, h, int32(idx), -1) 261 } 262 return 263 } 264 265 // GetOrInsertNull retrieves the index of a null in the table or inserts 266 // null into the table, returning the index and a boolean indicating if it was 267 // found in the table (true) or was inserted (false). 268 func (b *BinaryMemoTable) GetOrInsertNull() (idx int, found bool) { 269 idx, found = b.GetNull() 270 if !found { 271 idx = b.Size() 272 b.nullIdx = idx 273 b.builder.AppendNull() 274 } 275 return 276 } 277 278 func (b *BinaryMemoTable) Value(i int) []byte { 279 return b.builder.Value(i) 280 } 281 282 // helper function to get the offset into the builder data for a given 283 // index value. 284 func (b *BinaryMemoTable) findOffset(idx int) uintptr { 285 if b.builder.DataLen() == 0 { 286 // only empty strings, short circuit 287 return 0 288 } 289 290 val := b.builder.Value(idx) 291 for len(val) == 0 { 292 idx++ 293 if idx >= b.builder.Len() { 294 break 295 } 296 val = b.builder.Value(idx) 297 } 298 if len(val) != 0 { 299 return uintptr(unsafe.Pointer(&val[0])) 300 } 301 return uintptr(b.builder.DataLen()) + b.findOffset(0) 302 } 303 304 // CopyOffsets copies the list of offsets into the passed in slice, the offsets 305 // being the start and end values of the underlying allocated bytes in the builder 306 // for the individual values of the table. out should be at least sized to Size()+1 307 func (b *BinaryMemoTable) CopyOffsets(out []int32) { 308 b.CopyOffsetsSubset(0, out) 309 } 310 311 // CopyOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, 312 // it gets a subset of the offsets in the table starting at the index provided by "start". 313 func (b *BinaryMemoTable) CopyOffsetsSubset(start int, out []int32) { 314 if b.builder.Len() <= start { 315 return 316 } 317 318 first := b.findOffset(0) 319 delta := b.findOffset(start) 320 sz := b.Size() 321 for i := start; i < sz; i++ { 322 offset := int32(b.findOffset(i) - delta) 323 out[i-start] = offset 324 } 325 326 out[sz-start] = int32(b.builder.DataLen() - (int(delta) - int(first))) 327 } 328 329 // CopyLargeOffsets copies the list of offsets into the passed in slice, the offsets 330 // being the start and end values of the underlying allocated bytes in the builder 331 // for the individual values of the table. out should be at least sized to Size()+1 332 func (b *BinaryMemoTable) CopyLargeOffsets(out []int64) { 333 b.CopyLargeOffsetsSubset(0, out) 334 } 335 336 // CopyLargeOffsetsSubset is like CopyOffsets but instead of copying all of the offsets, 337 // it gets a subset of the offsets in the table starting at the index provided by "start". 338 func (b *BinaryMemoTable) CopyLargeOffsetsSubset(start int, out []int64) { 339 if b.builder.Len() <= start { 340 return 341 } 342 343 first := b.findOffset(0) 344 delta := b.findOffset(start) 345 sz := b.Size() 346 for i := start; i < sz; i++ { 347 offset := int64(b.findOffset(i) - delta) 348 out[i-start] = offset 349 } 350 351 out[sz-start] = int64(b.builder.DataLen() - (int(delta) - int(first))) 352 } 353 354 // CopyValues copies the raw binary data bytes out, out should be a []byte 355 // with at least ValuesSize bytes allocated to copy into. 356 func (b *BinaryMemoTable) CopyValues(out interface{}) { 357 b.CopyValuesSubset(0, out) 358 } 359 360 // CopyValuesSubset copies the raw binary data bytes out starting with the value 361 // at the index start, out should be a []byte with at least ValuesSize bytes allocated 362 func (b *BinaryMemoTable) CopyValuesSubset(start int, out interface{}) { 363 if b.builder.Len() <= start { 364 return 365 } 366 367 var ( 368 first = b.findOffset(0) 369 offset = b.findOffset(int(start)) 370 length = b.builder.DataLen() - int(offset-first) 371 ) 372 373 outval := out.([]byte) 374 copy(outval, b.builder.Value(start)[0:length]) 375 } 376 377 func (b *BinaryMemoTable) WriteOut(out []byte) { 378 b.CopyValues(out) 379 } 380 381 func (b *BinaryMemoTable) WriteOutSubset(start int, out []byte) { 382 b.CopyValuesSubset(start, out) 383 } 384 385 // CopyFixedWidthValues exists to cope with the fact that the table doesn't keep 386 // track of the fixed width when inserting the null value the databuffer holds a 387 // zero length byte slice for the null value (if found) 388 func (b *BinaryMemoTable) CopyFixedWidthValues(start, width int, out []byte) { 389 if start >= b.Size() { 390 return 391 } 392 393 null, exists := b.GetNull() 394 if !exists || null < start { 395 // nothing to skip, proceed as usual 396 b.CopyValuesSubset(start, out) 397 return 398 } 399 400 var ( 401 leftOffset = b.findOffset(start) 402 nullOffset = b.findOffset(null) 403 leftSize = nullOffset - leftOffset 404 rightOffset = leftOffset + uintptr(b.ValuesSize()) 405 ) 406 407 if leftSize > 0 { 408 copy(out, b.builder.Value(start)[0:leftSize]) 409 } 410 411 rightSize := rightOffset - nullOffset 412 if rightSize > 0 { 413 // skip the null fixed size value 414 copy(out[int(leftSize)+width:], b.builder.Value(null + 1)[0:rightSize]) 415 } 416 } 417 418 // VisitValues exists to run the visitFn on each value currently in the hash table. 419 func (b *BinaryMemoTable) VisitValues(start int, visitFn func([]byte)) { 420 for i := int(start); i < b.Size(); i++ { 421 visitFn(b.builder.Value(i)) 422 } 423 } 424 425 // Release is used to tell the underlying builder that it can release the memory allocated 426 // when the reference count reaches 0, this is safe to be called from multiple goroutines 427 // simultaneously 428 func (b *BinaryMemoTable) Release() { b.builder.Release() } 429 430 // Retain increases the ref count, it is safe to call it from multiple goroutines 431 // simultaneously. 432 func (b *BinaryMemoTable) Retain() { b.builder.Retain() } 433 434 // ValuesSize returns the current total size of all the raw bytes that have been inserted 435 // into the memotable so far. 436 func (b *BinaryMemoTable) ValuesSize() int { return b.builder.DataLen() }