github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/internal/bindex/hash_index.go (about) 1 // Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package bindex 16 17 import ( 18 "arena" 19 "encoding/binary" 20 "sort" 21 ) 22 23 const ( 24 MaxLowBit = 0xFFFF 25 HashIndexShardItemAvg = 1 << 8 26 HashIndexShardsNum = 64 << 10 27 HashIndexShardSize = 4 28 HashIndexItem32Size = 6 29 HashIndexItem64Size = 10 30 ) 31 32 type HashIndex struct { 33 header Header 34 size uint32 35 length uint32 36 itemOffset uint32 37 type32 bool 38 uniq bool 39 data []byte 40 data32 []FItem32Array 41 data64 []FItem64Array 42 arena *arena.Arena 43 } 44 45 type FItem32 struct { 46 key uint16 47 value uint32 48 } 49 50 type FItem64 struct { 51 key uint16 52 value uint64 53 } 54 55 type FItem32Array []FItem32 56 57 func (i32 FItem32Array) Len() int { 58 return len(i32) 59 } 60 61 func (i32 FItem32Array) Swap(i, j int) { 62 i32[i], i32[j] = i32[j], i32[i] 63 } 64 65 func (i32 FItem32Array) Less(i, j int) bool { 66 return i32[i].key < i32[j].key 67 } 68 69 type FItem64Array []FItem64 70 71 func (i64 FItem64Array) Len() int { 72 return len(i64) 73 } 74 75 func (i64 FItem64Array) Swap(i, j int) { 76 i64[i], i64[j] = i64[j], i64[i] 77 } 78 79 func (i64 FItem64Array) Less(i, j int) bool { 80 return i64[i].key < i64[j].key 81 } 82 83 func NewHashIndex(type32 bool) *HashIndex { 84 offset := uint32(SuccinctHeaderSize + HashIndexShardsNum*HashIndexShardSize) 85 86 m := &HashIndex{ 87 header: Header{version: SuccinctVersion, reserved: 0, shards: HashIndexShardsNum}, 88 size: offset, 89 length: 0, 90 itemOffset: offset, 91 type32: type32, 92 uniq: false, 93 data: nil, 94 data32: nil, 95 data64: nil, 96 arena: nil, 97 } 98 99 return m 100 } 101 102 func (s *HashIndex) Size() uint32 { 103 if !s.uniq { 104 s.Unique() 105 } 106 return s.size 107 } 108 109 func (s *HashIndex) Length() uint32 { 110 return s.length 111 } 112 113 func (s *HashIndex) GetData() []byte { 114 return s.data 115 } 116 117 func (s *HashIndex) SetReader(d []byte) bool { 118 if d == nil || len(d) <= int(s.itemOffset) { 119 return false 120 } 121 122 s.data = d 123 s.header = s.readHeader(s.data) 124 125 return true 126 } 127 128 func (s *HashIndex) InitWriter() { 129 s.arena = arena.NewArena() 130 131 if s.type32 { 132 s.data32 = arena.MakeSlice[FItem32Array](s.arena, int(s.header.shards), int(s.header.shards)) 133 } else { 134 s.data64 = arena.MakeSlice[FItem64Array](s.arena, int(s.header.shards), int(s.header.shards)) 135 } 136 } 137 138 func (s *HashIndex) SetWriter(d []byte) bool { 139 if d == nil || len(d) < int(s.size) || cap(d) < int(s.size) { 140 return false 141 } 142 143 s.data = d 144 145 return true 146 } 147 148 func (s *HashIndex) Add(key uint32, value any) { 149 switch value.(type) { 150 case uint32: 151 if s.type32 { 152 s.add32Internal(key, value.(uint32)) 153 } 154 return 155 case uint64: 156 if !s.type32 { 157 s.add64Internal(key, value.(uint64)) 158 } 159 return 160 default: 161 return 162 } 163 } 164 165 func (s *HashIndex) Unique() { 166 if s.uniq { 167 return 168 } 169 170 if s.type32 { 171 s.unique32Internal() 172 } else { 173 s.unique64Internal() 174 } 175 176 s.uniq = true 177 } 178 179 func (s *HashIndex) Serialize() bool { 180 if !s.uniq { 181 s.Unique() 182 } 183 184 if s.type32 { 185 return s.serialize32Internal() 186 } else { 187 return s.serialize64Internal() 188 } 189 } 190 191 func (s *HashIndex) Get(key uint32) (any, bool) { 192 if s.type32 { 193 return s.Get32(key) 194 } else { 195 return s.Get64(key) 196 } 197 } 198 199 func (s *HashIndex) add32Internal(key uint32, value uint32) { 200 if s.header.shards <= 0 { 201 return 202 } 203 204 hid := s.highbits(key) 205 lid := s.lowbits(key) 206 207 if len(s.data32[hid]) == 0 { 208 s.data32[hid] = arena.MakeSlice[FItem32](s.arena, 0, HashIndexShardItemAvg) 209 } 210 211 s.data32[hid] = append(s.data32[hid], FItem32{key: lid, value: value}) 212 213 s.size += HashIndexItem32Size 214 s.length++ 215 } 216 217 func (s *HashIndex) add64Internal(key uint32, value uint64) { 218 if s.header.shards <= 0 { 219 return 220 } 221 222 hid := s.highbits(key) 223 lid := s.lowbits(key) 224 225 if len(s.data64[hid]) == 0 { 226 s.data64[hid] = arena.MakeSlice[FItem64](s.arena, 0, HashIndexShardItemAvg) 227 } 228 229 s.data64[hid] = append(s.data64[hid], FItem64{key: lid, value: value}) 230 231 s.size += HashIndexItem64Size 232 s.length++ 233 } 234 235 func (s *HashIndex) unique32Internal() { 236 if s.size <= s.itemOffset || s.length <= 0 || len(s.data32) <= 0 { 237 return 238 } 239 240 for i := uint32(0); i < s.header.shards; i++ { 241 itemsLen := uint32(len(s.data32[i])) 242 if itemsLen > 1 { 243 sort.Sort(s.data32[i]) 244 245 uniqFlag := false 246 prevItem := int32(-1) 247 for j := uint32(0); j < itemsLen; j++ { 248 if prevItem == int32(s.data32[i][j].key) { 249 itemsLen-- 250 s.length-- 251 s.size -= HashIndexItem32Size 252 copy(s.data32[i][j:], s.data32[i][j+1:]) 253 uniqFlag = true 254 continue 255 } 256 257 prevItem = int32(s.data32[i][j].key) 258 } 259 260 if uniqFlag { 261 s.data32[i] = s.data32[i][0:itemsLen] 262 } 263 } 264 } 265 } 266 267 func (s *HashIndex) unique64Internal() { 268 if s.size <= s.itemOffset || s.length <= 0 || len(s.data64) <= 0 { 269 return 270 } 271 272 for i := uint32(0); i < s.header.shards; i++ { 273 itemsLen := uint32(len(s.data64[i])) 274 if itemsLen > 1 { 275 sort.Sort(s.data64[i]) 276 277 uniqFlag := false 278 prevItem := int32(-1) 279 for j := uint32(0); j < itemsLen; j++ { 280 if prevItem == int32(s.data64[i][j].key) { 281 itemsLen-- 282 s.length-- 283 s.size -= HashIndexItem64Size 284 copy(s.data64[i][j:], s.data64[i][j+1:]) 285 uniqFlag = true 286 continue 287 } 288 289 prevItem = int32(s.data64[i][j].key) 290 } 291 292 if uniqFlag { 293 s.data64[i] = s.data64[i][0:itemsLen] 294 } 295 } 296 } 297 } 298 299 func (s *HashIndex) serialize32Internal() bool { 300 if s.size <= s.itemOffset || s.length <= 0 || len(s.data32) <= 0 { 301 return false 302 } 303 304 shardOffset := uint32(0) 305 itemOffset := s.itemOffset 306 307 if s.data == nil { 308 s.data = arena.MakeSlice[byte](s.arena, int(s.size), int(s.size)) 309 } 310 311 s.writeHeader(s.data[shardOffset:], s.header) 312 shardOffset += SuccinctHeaderSize 313 314 totalCount := uint32(0) 315 for i := uint32(0); i < s.header.shards; i++ { 316 itemsLen := uint32(len(s.data32[i])) 317 totalCount += itemsLen 318 s.writeShard(s.data[shardOffset:], totalCount) 319 shardOffset += HashIndexShardSize 320 321 if itemsLen > 0 { 322 for j := uint32(0); j < itemsLen; j++ { 323 s.writeItem32(s.data[itemOffset:], s.data32[i][j]) 324 itemOffset += HashIndexItem32Size 325 } 326 } 327 } 328 329 return true 330 } 331 332 func (s *HashIndex) serialize64Internal() bool { 333 if s.size <= s.itemOffset || s.length <= 0 || len(s.data64) <= 0 { 334 return false 335 } 336 337 shardOffset := uint32(0) 338 itemOffset := s.itemOffset 339 340 if s.data == nil { 341 s.data = arena.MakeSlice[byte](s.arena, int(s.size), int(s.size)) 342 } 343 344 s.writeHeader(s.data[shardOffset:], s.header) 345 shardOffset += SuccinctHeaderSize 346 347 totalCount := uint32(0) 348 for i := uint32(0); i < s.header.shards; i++ { 349 itemsLen := uint32(len(s.data64[i])) 350 totalCount += itemsLen 351 s.writeShard(s.data[shardOffset:], totalCount) 352 shardOffset += HashIndexShardSize 353 354 if itemsLen > 0 { 355 for j := uint32(0); j < itemsLen; j++ { 356 s.writeItem64(s.data[itemOffset:], s.data64[i][j]) 357 itemOffset += HashIndexItem64Size 358 } 359 } 360 } 361 362 return true 363 } 364 365 func (s *HashIndex) Get32(key uint32) (uint32, bool) { 366 if len(s.data) <= int(s.itemOffset) || s.header.shards <= 0 { 367 return 0, false 368 } 369 370 hid := s.highbits(key) 371 lid := s.lowbits(key) 372 373 originCount := uint32(0) 374 if hid > 0 { 375 originOffset := uint32(SuccinctHeaderSize) + uint32(hid-1)*HashIndexShardSize 376 originCount = s.readShard(s.data[originOffset:]) 377 } 378 379 destOffset := uint32(SuccinctHeaderSize) + uint32(hid)*HashIndexShardSize 380 destCount := s.readShard(s.data[destOffset:]) 381 if destCount <= originCount { 382 return 0, false 383 } 384 385 itemLength := destCount - originCount 386 curOffset := s.itemOffset + originCount*HashIndexItem32Size 387 388 ok, idx := s.findItem(lid, s.data[curOffset:], int(itemLength), HashIndexItem32Size) 389 if !ok { 390 return 0, false 391 } 392 393 curOffset += uint32(idx * HashIndexItem32Size) 394 value := s.readItem32Value(s.data[curOffset:]) 395 396 return value, true 397 } 398 399 func (s *HashIndex) Get64(key uint32) (uint64, bool) { 400 if len(s.data) <= int(s.itemOffset) || s.header.shards <= 0 { 401 return 0, false 402 } 403 404 hid := s.highbits(key) 405 lid := s.lowbits(key) 406 407 originCount := uint32(0) 408 if hid > 0 { 409 originOffset := uint32(SuccinctHeaderSize) + uint32(hid-1)*HashIndexShardSize 410 originCount = s.readShard(s.data[originOffset:]) 411 } 412 413 destOffset := uint32(SuccinctHeaderSize) + uint32(hid)*HashIndexShardSize 414 destCount := s.readShard(s.data[destOffset:]) 415 if destCount <= originCount { 416 return 0, false 417 } 418 419 itemLength := destCount - originCount 420 curOffset := s.itemOffset + originCount*HashIndexItem64Size 421 422 ok, idx := s.findItem(lid, s.data[curOffset:], int(itemLength), HashIndexItem64Size) 423 if !ok { 424 return 0, false 425 } 426 427 curOffset += uint32(idx * HashIndexItem64Size) 428 value := s.readItem64Value(s.data[curOffset:]) 429 430 return value, true 431 } 432 433 func (s *HashIndex) Finish() { 434 s.size = SuccinctHeaderSize 435 s.length = 0 436 s.uniq = false 437 s.data32 = nil 438 s.data64 = nil 439 if s.arena != nil { 440 s.arena.Free() 441 s.arena = nil 442 } 443 } 444 445 func (s *HashIndex) writeHeader(buf []byte, header Header) { 446 binary.BigEndian.PutUint16(buf[0:], header.version) 447 binary.BigEndian.PutUint16(buf[2:], header.reserved) 448 binary.BigEndian.PutUint32(buf[4:], header.shards) 449 } 450 451 func (s *HashIndex) writeShard(buf []byte, count uint32) { 452 binary.BigEndian.PutUint32(buf[0:], count) 453 } 454 455 func (s *HashIndex) writeItem32(buf []byte, item32 FItem32) { 456 binary.BigEndian.PutUint16(buf[0:], item32.key) 457 binary.BigEndian.PutUint32(buf[2:], item32.value) 458 } 459 460 func (s *HashIndex) writeItem64(buf []byte, item64 FItem64) { 461 binary.BigEndian.PutUint16(buf[0:], item64.key) 462 binary.BigEndian.PutUint64(buf[2:], item64.value) 463 } 464 465 func (s *HashIndex) readHeader(buf []byte) Header { 466 header := Header{ 467 version: binary.BigEndian.Uint16(buf[0:]), 468 reserved: binary.BigEndian.Uint16(buf[2:]), 469 shards: binary.BigEndian.Uint32(buf[4:]), 470 } 471 472 return header 473 } 474 475 func (s *HashIndex) readShard(buf []byte) uint32 { 476 return binary.BigEndian.Uint32(buf[0:]) 477 } 478 479 func (s *HashIndex) readItem32Value(buf []byte) uint32 { 480 return binary.BigEndian.Uint32(buf[2:]) 481 } 482 483 func (s *HashIndex) readItem64Value(buf []byte) uint64 { 484 return binary.BigEndian.Uint64(buf[2:]) 485 } 486 487 func (s *HashIndex) findItem(key uint16, buf []byte, n int, step int) (bool, int) { 488 i, j := 0, n 489 for i < j { 490 h := int(uint(i+j) >> 1) 491 if binary.BigEndian.Uint16(buf[step*h:]) < key { 492 i = h + 1 493 } else { 494 j = h 495 } 496 } 497 498 if i < n && binary.BigEndian.Uint16(buf[step*i:]) == key { 499 return true, i 500 } 501 502 return false, 0 503 } 504 505 func (s *HashIndex) highbits(x uint32) uint16 { 506 return uint16(x >> 16) 507 } 508 509 func (s *HashIndex) lowbits(x uint32) uint16 { 510 return uint16(x & MaxLowBit) 511 }