github.com/zuoyebang/bitalosdb@v1.1.1-0.20240516111551-79a8c4d8ce20/internal/bindex/vector_index.go

// Copyright 2021 The Bitalosdb author(hustxrb@163.com) and other contributors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bindex

import (
	"arena"
	"encoding/binary"
	"math"
	"unsafe"
)

// kv64 is a single slot mapping a uint32 key to a uint64 value.
type kv64 struct {
	key   uint32
	value uint64
}

// kv32 is a single slot mapping a uint32 key to a uint32 value.
type kv32 struct {
	key   uint32
	value uint32
}

type metadata [groupSize]int8
type group32 [groupSize]kv32
type group64 [groupSize]kv64

const (
	VectorVersion        = 2
	h1Mask        uint32 = 0xffff_ff80
	h2Mask        uint32 = 0x0000_007f
	empty         int8   = -128 // 0b1000_0000
)

// h1 is the upper 25 bits of a key hash, used to select the starting group.
type h1 uint32

// h2 is the low 7 bits of a key hash, stored in the control metadata.
type h2 int8

// splitHash splits a 32-bit hash into its h1 (group selector) and h2
// (control byte) parts.
func splitHash(h uint32) (h1, h2) {
	return h1((h & h1Mask) >> 7), h2(h & h2Mask)
}

// probeStart returns the first group to probe for a given h1.
func probeStart(hi h1, groups int) uint32 {
	return uint32(hi) % uint32(groups)
}

// VectorValType is the width of the stored values, in bytes.
type VectorValType uint8

const (
	VectorValTypeUint32 VectorValType = 4
	VectorValTypeUint64 VectorValType = 8
)

type VectorHeader struct {
	version uint16
	vtype   VectorValType
	shards  uint32
}

// VectorIndex is a swiss-table style hash index from uint32 keys to uint32 or
// uint64 values. It is either built in memory and then serialized, or opened
// read-only on top of a previously serialized byte slice.
type VectorIndex struct {
	header     VectorHeader
	ctrl       []metadata
	groups32   []group32
	groups64   []group64
	resident   uint32
	limit      uint32
	count      uint32
	groupBytes uint32
	saveGroupN uint32
	data       []byte
	arena      *arena.Arena
}

func NewVectorIndex() (m *VectorIndex) {
	m = &VectorIndex{}
	return
}

// InitWriter prepares an in-memory table sized for sz entries with values of
// the given width.
func (m *VectorIndex) InitWriter(sz uint32, vtype VectorValType) {
	groups := numGroups(sz)
	m.header = VectorHeader{
		version: VectorVersion,
		vtype:   vtype,
		shards:  groups,
	}
	m.ctrl = make([]metadata, groups)
	m.limit = groups * maxAvgGroupLoad
	m.groupBytes = groupSize * (5 + uint32(vtype))
	m.arena = arena.NewArena()

	switch vtype {
	case VectorValTypeUint32:
		m.groups32 = make([]group32, groups)
	case VectorValTypeUint64:
		m.groups64 = make([]group64, groups)
	}
	for i := range m.ctrl {
		m.ctrl[i] = newEmptyMetadata()
	}
}

// SetReader attaches a serialized index for read-only lookups.
func (m *VectorIndex) SetReader(d []byte) bool {
	if d == nil {
		return false
	}

	m.data = d
	m.header = readHeader(m.data)
	if m.header.vtype != VectorValTypeUint32 && m.header.vtype != VectorValTypeUint64 {
		return false
	}
	m.groupBytes = groupSize * (5 + uint32(m.header.vtype))
	return m.readMetadata()
}

// Get dispatches to Get32 or Get64 based on the value type in the header.
func (m *VectorIndex) Get(key uint32) (any, bool) {
	if m.header.vtype == VectorValTypeUint32 {
		return m.Get32(key)
	} else {
		return m.Get64(key)
	}
}

// innerMemGet32 looks key up in the in-memory groups (before serialization).
func (m *VectorIndex) innerMemGet32(key uint32) (value uint32, ok bool) {
	hi, lo := splitHash(key)
	g := probeStart(hi, len(m.groups32))
	for {
		matches := metaMatchH2(&m.ctrl[g], lo)
		for matches != 0 {
			s := nextMatch(&matches)
			if key == m.groups32[g][s].key {
				value, ok = m.groups32[g][s].value, true
				return
			}
		}
		matches = metaMatchEmpty(&m.ctrl[g])
		if matches != 0 {
			ok = false
			return
		}
		g += 1
		if g >= uint32(len(m.groups32)) {
			g = 0
		}
	}
}

// innerMemGet64 looks key up in the in-memory groups (before serialization).
func (m *VectorIndex) innerMemGet64(key uint32) (value uint64, ok bool) {
	hi, lo := splitHash(key)
	g := probeStart(hi, len(m.groups64))
	for {
		matches := metaMatchH2(&m.ctrl[g], lo)
		for matches != 0 {
			s := nextMatch(&matches)
			if key == m.groups64[g][s].key {
				value, ok = m.groups64[g][s].value, true
				return
			}
		}
		matches = metaMatchEmpty(&m.ctrl[g])
		if matches != 0 {
			ok = false
			return
		}
		g += 1
		if g >= uint32(len(m.groups64)) {
			g = 0
		}
	}
}

// rehash32 re-inserts every resident 32-bit entry into a table of n groups.
func (m *VectorIndex) rehash32(n uint32) {
	groups, ctrl := m.groups32, m.ctrl
	m.groups32 = make([]group32, n)
	m.ctrl = make([]metadata, n)
	for i := range m.ctrl {
		m.ctrl[i] = newEmptyMetadata()
	}
	m.limit = n * maxAvgGroupLoad
	m.resident = 0
	for g := range ctrl {
		for s := range ctrl[g] {
			c := ctrl[g][s]
			if c == empty {
				continue
			}
			m.add32rehash(groups[g][s].key, groups[g][s].value)
		}
	}
}

// rehash64 re-inserts every resident 64-bit entry into a table of n groups.
func (m *VectorIndex) rehash64(n uint32) {
	groups, ctrl := m.groups64, m.ctrl
	m.groups64 = make([]group64, n)
	m.ctrl = make([]metadata, n)
	for i := range m.ctrl {
		m.ctrl[i] = newEmptyMetadata()
	}
	m.limit = n * maxAvgGroupLoad
	m.resident = 0
	for g := range ctrl {
		for s := range ctrl[g] {
			c := ctrl[g][s]
			if c == empty {
				continue
			}
			m.add64rehash(groups[g][s].key, groups[g][s].value)
		}
	}
}

func (m *VectorIndex) add32rehash(key uint32, value uint32) {
	hi, lo := splitHash(key)
	g := probeStart(hi, len(m.groups32))
	for {
		matches := metaMatchH2(&m.ctrl[g], lo)
		for matches != 0 {
			s := nextMatch(&matches)
			if key == m.groups32[g][s].key {
				return
			}
		}
		matches = metaMatchEmpty(&m.ctrl[g])
		if matches != 0 {
			s := nextMatch(&matches)
			m.groups32[g][s].key = key
			m.groups32[g][s].value = value
			m.ctrl[g][s] = int8(lo)
			m.resident++
			return
		}
		g += 1
		if g >= uint32(len(m.groups32)) {
			g = 0
		}
	}
}

func (m *VectorIndex) add64rehash(key uint32, value uint64) {
	hi, lo := splitHash(key)
	g := probeStart(hi, len(m.groups64))
	for {
		matches := metaMatchH2(&m.ctrl[g], lo)
		for matches != 0 {
			s := nextMatch(&matches)
			if key == m.groups64[g][s].key {
				return
			}
		}
		matches = metaMatchEmpty(&m.ctrl[g])
		if matches != 0 {
			s := nextMatch(&matches)
			m.groups64[g][s].key = key
			m.groups64[g][s].value = value
			m.ctrl[g][s] = int8(lo)
			m.resident++
			return
		}
		g += 1
		if g >= uint32(len(m.groups64)) {
			g = 0
		}
	}
}

// Add32 inserts or updates a uint32 value, growing the table when the load
// limit is reached.
func (m *VectorIndex) Add32(key uint32, value uint32) {
	if m.header.vtype != VectorValTypeUint32 {
		return
	}
	if m.resident >= m.limit {
		m.rehash32(uint32(math.Ceil(float64(len(m.groups32)) * 1.5)))
	}
	hi, lo := splitHash(key)
	g := probeStart(hi, len(m.groups32))
	for {
		matches := metaMatchH2(&m.ctrl[g], lo)
		for matches != 0 {
			s := nextMatch(&matches)
			if key == m.groups32[g][s].key {
				m.groups32[g][s].value = value
				return
			}
		}
		matches = metaMatchEmpty(&m.ctrl[g])
		if matches != 0 {
			s := nextMatch(&matches)
			m.groups32[g][s].key = key
			m.groups32[g][s].value = value
			m.ctrl[g][s] = int8(lo)
			m.resident++
			return
		}
		g += 1
		if g >= uint32(len(m.groups32)) {
			g = 0
		}
	}
}

// Add64 inserts or updates a uint64 value, growing the table when the load
// limit is reached.
func (m *VectorIndex) Add64(key uint32, value uint64) {
	if m.header.vtype != VectorValTypeUint64 {
		return
	}
	if m.resident >= m.limit {
		m.rehash64(uint32(math.Ceil(float64(len(m.groups64)) * 1.2)))
	}
	hi, lo := splitHash(key)
	g := probeStart(hi, len(m.groups64))
	for {
		matches := metaMatchH2(&m.ctrl[g], lo)
		for matches != 0 {
			s := nextMatch(&matches)
			if key == m.groups64[g][s].key {
				m.groups64[g][s].value = value
				return
			}
		}
		matches = metaMatchEmpty(&m.ctrl[g])
		if matches != 0 {
			s := nextMatch(&matches)
			m.groups64[g][s].key = key
			m.groups64[g][s].value = value
			m.ctrl[g][s] = int8(lo)
			m.resident++
			return
		}
		g += 1
		if g >= uint32(len(m.groups64)) {
			g = 0
		}
	}
}

// Length returns the number of resident entries.
func (m *VectorIndex) Length() uint32 {
	return m.resident
}

// Size returns the serialized size in bytes.
func (m *VectorIndex) Size() uint32 {
	n := m.saveGroups()
	// header+count+ctrl+groups(k+v)
	return 12 + n*m.groupBytes
}

//go:inline
func (m *VectorIndex) calGroupHead(g uint32) uint32 {
	return 12 + g*m.groupBytes
}

//go:inline
func (m *VectorIndex) calGroups(size uint32) uint32 {
	// header+count+ctrl+groups(k+v)
	return (size - 12) / m.groupBytes
}

func (m *VectorIndex) GetData() []byte {
	return m.data
}

// Capacity returns how many more entries fit before the next rehash.
func (m *VectorIndex) Capacity() uint32 {
	return m.limit - m.resident
}

// saveGroups returns the number of groups to serialize: the table is shrunk to
// the minimum needed only when that saves more than 100 groups or more than a
// quarter of the current groups.
//
//go:inline
func (m *VectorIndex) saveGroups() uint32 {
	n := uint32(math.Ceil(float64(m.resident) / float64(maxAvgGroupLoad)))
	cn := uint32(len(m.groups32))
	sub := cn - n
	if sub > 100 || float32(sub)/float32(cn) > 0.25 {
		return n
	}
	return cn
}

func (m *VectorIndex) Serialize() bool {
	switch m.header.vtype {
	case VectorValTypeUint32:
		return m.Serialize32()
	case VectorValTypeUint64:
		return m.Serialize64()
	default:
		return false
	}
}

func (m *VectorIndex) Serialize32() bool {
	if m.resident <= 0 {
		return false
	}
	m.saveGroupN = m.saveGroups()
	if m.saveGroupN != uint32(len(m.ctrl)) {
		m.rehash32(m.saveGroupN)
	}

	size := int(m.Size())

	if m.data == nil {
		m.data = arena.MakeSlice[byte](m.arena, size, size)
	}
	writeHeader(m.data, m.header)
	writeCount(m.data[8:], m.resident)
	tail := 12
	for g := range m.ctrl {
		copy(m.data[tail:], (*[groupSize]byte)(unsafe.Pointer(&m.ctrl[g]))[:])
		tail += groupSize
		for s := range m.groups32[g] {
			if m.ctrl[g][s] != empty {
				writeKV32(m.data[tail:], m.groups32[g][s])
			}
			tail += 8
		}
	}
	return true
}

func (m *VectorIndex) Serialize64() bool {
	if m.resident <= 0 {
		return false
	}
	m.saveGroupN = m.saveGroups()
	if m.saveGroupN != uint32(len(m.ctrl)) {
		m.rehash64(m.saveGroupN)
	}

	size := int(m.Size())

	if m.data == nil {
		m.data = arena.MakeSlice[byte](m.arena, size, size)
	}
	writeHeader(m.data, m.header)
	writeCount(m.data[8:], m.resident)
	tail := 12
	for g := range m.ctrl {
		copy(m.data[tail:], (*[groupSize]byte)(unsafe.Pointer(&m.ctrl[g]))[:])
		tail += groupSize
		for s := range m.groups64[g] {
			if m.ctrl[g][s] != empty {
				writeKV64(m.data[tail:], m.groups64[g][s])
			}
			tail += 12
		}
	}
	return true
}

// SetWriter lets the caller supply the output buffer used by Serialize.
func (m *VectorIndex) SetWriter(d []byte) bool {
	if d == nil || len(d) < int(m.Size()) {
		return false
	}

	m.data = d

	return true
}

// readMetadata loads the entry count and per-group control bytes from the
// serialized data.
func (m *VectorIndex) readMetadata() bool {
	m.count = readUint32(m.data[8:])
	if m.count == 0 {
		return false
	}
	m.resident = m.count
	m.limit = m.count
	gs := m.calGroups(uint32(len(m.data)))
	m.ctrl = make([]metadata, gs)
	for i := range m.ctrl {
		m.ctrl[i] = *(*metadata)(unsafe.Pointer(&m.data[12+uint32(i)*m.groupBytes]))
	}
	return true
}

// Get32 looks key up in the serialized data.
func (m *VectorIndex) Get32(key uint32) (value uint32, ok bool) {
	hi, lo := splitHash(key)
	g := probeStart(hi, len(m.ctrl))
	for {
		matches := metaMatchH2(&m.ctrl[g], lo)
		for matches != 0 {
			s := nextMatch(&matches)
			kIdx := m.calGroupHead(g) + groupSize + s*(4+uint32(m.header.vtype))
			k := readUint32(m.data[kIdx:])
			if key == k {
				value, ok = readKV32Value(m.data[kIdx:]), true
				return
			}
		}
		matches = metaMatchEmpty(&m.ctrl[g])
		if matches != 0 {
			ok = false
			return
		}
		g += 1
		if g >= uint32(len(m.ctrl)) {
			g = 0
		}
	}
}

// Get64 looks key up in the serialized data.
func (m *VectorIndex) Get64(key uint32) (value uint64, ok bool) {
	hi, lo := splitHash(key)
	g := probeStart(hi, len(m.ctrl))
	for {
		matches := metaMatchH2(&m.ctrl[g], lo)
		for matches != 0 {
			s := nextMatch(&matches)
			kIdx := m.calGroupHead(g) + groupSize + s*(4+uint32(m.header.vtype))
			k := readUint32(m.data[kIdx:])
			if key == k {
				value, ok = readKV64Value(m.data[kIdx:]), true
				return
			}
		}
		matches = metaMatchEmpty(&m.ctrl[g])
		if matches != 0 {
			ok = false
			return
		}
		g += 1
		if g >= uint32(len(m.ctrl)) {
			g = 0
		}
	}
}

// Finish releases the in-memory groups and the arena backing the serialized
// buffer.
func (m *VectorIndex) Finish() {
	m.groups64 = nil
	m.groups32 = nil
	if m.arena != nil {
		m.arena.Free()
		m.arena = nil
	}
}

// numGroups returns the number of groups needed to hold n entries at
// maxAvgGroupLoad entries per group.
func numGroups(n uint32) (groups uint32) {
	groups = (n + maxAvgGroupLoad - 1) / maxAvgGroupLoad
	if groups == 0 {
		groups = 1
	}
	return
}

// newEmptyMetadata returns a control group with every slot marked empty.
func newEmptyMetadata() (meta metadata) {
	for i := range meta {
		meta[i] = empty
	}
	return
}

func writeHeader(buf []byte, header VectorHeader) {
	binary.BigEndian.PutUint16(buf[0:], header.version)
	binary.BigEndian.PutUint16(buf[2:], uint16(header.vtype))
	binary.BigEndian.PutUint32(buf[4:], header.shards)
}

func writeCount(buf []byte, count uint32) {
	binary.BigEndian.PutUint32(buf[0:], count)
}

func writeKV32(buf []byte, item32 kv32) {
	binary.BigEndian.PutUint32(buf[0:], item32.key)
	binary.BigEndian.PutUint32(buf[4:], item32.value)
}

func writeKV64(buf []byte, item64 kv64) {
	binary.BigEndian.PutUint32(buf[0:], item64.key)
	binary.BigEndian.PutUint64(buf[4:], item64.value)
}

func readHeader(buf []byte) VectorHeader {
	header := VectorHeader{
		version: binary.BigEndian.Uint16(buf[0:]),
		vtype:   VectorValType(binary.BigEndian.Uint16(buf[2:])),
		shards:  binary.BigEndian.Uint32(buf[4:]),
	}

	return header
}

func readUint32(buf []byte) uint32 {
	return binary.BigEndian.Uint32(buf[0:])
}

func readKV32Value(buf []byte) uint32 {
	return binary.BigEndian.Uint32(buf[4:])
}

func readKV64Value(buf []byte) uint64 {
	return binary.BigEndian.Uint64(buf[4:])
}
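
// Usage (a minimal sketch; groupSize, maxAvgGroupLoad and the metaMatch*
// helpers used above are assumed to be defined elsewhere in package bindex,
// and the standard "arena" import requires building with GOEXPERIMENT=arenas):
// build an index in memory, serialize it, then open the serialized bytes
// read-only.
//
//	idx := NewVectorIndex()
//	idx.InitWriter(1024, VectorValTypeUint32)
//	idx.Add32(7, 100)
//	idx.Add32(8, 200)
//	if idx.Serialize() {
//		rd := NewVectorIndex()
//		if rd.SetReader(idx.GetData()) {
//			v, ok := rd.Get32(7) // v == 100, ok == true
//			_, _ = v, ok
//		}
//	}
//	idx.Finish() // frees the arena backing the serialized buffer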