github.com/weaviate/sroar@v0.0.0-20230210105426-26108af5465d/container.go (about) 1 /* 2 * Copyright 2021 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package sroar 18 19 import ( 20 "fmt" 21 "math" 22 "math/bits" 23 "strings" 24 ) 25 26 // container uses extra 4 []uint16 in the front as header. 27 // container[0] is used for storing the size of the container, expressed in Uint16. 28 // The container size cannot exceed the vicinity of 8KB. At 8KB, we switch from packed arrays to 29 // bitmaps. We can fit the entire uint16 worth of bitmaps in 8KB (2^16 / 8 = 8 30 // KB). 31 32 const ( 33 typeArray uint16 = 0x00 34 typeBitmap uint16 = 0x01 35 36 // Container header. 37 indexSize int = 0 38 indexType int = 1 39 indexCardinality int = 2 40 // Index 2 and 3 is used for cardinality. We need 2 uint16s to store cardinality because 41 // 2^16 will not fit in uint16. 42 startIdx uint16 = 4 43 44 minContainerSize = 64 // In Uint16. 45 // Bitmap container can contain 2^16 integers. Each integer would use one bit to represent. 46 // Given that our data is represented in []uint16s, that'd mean the size of container to store 47 // it would be divided by 16. 48 // 4 for header and 4096 for storing bitmap container. In Uint16. 49 maxContainerSize = 4 + (1<<16)/16 50 ) 51 52 func dataAt(data []uint16, i int) uint16 { return data[int(startIdx)+i] } 53 54 func incrCardinality(data []uint16) { 55 cur := getCardinality(data) 56 if cur+1 > math.MaxUint16 { 57 data[indexCardinality+1] = 1 58 } else { 59 data[indexCardinality]++ 60 } 61 } 62 63 var invalidCardinality int = math.MaxUint16 + 10 64 var maxCardinality int = math.MaxUint16 + 1 65 66 func getCardinality(data []uint16) int { 67 // This sum has to be done using two ints to avoid overflow. 68 return int(data[indexCardinality]) + int(data[indexCardinality+1]) 69 } 70 71 func setCardinality(data []uint16, c int) { 72 if c > math.MaxUint16 { 73 data[indexCardinality] = math.MaxUint16 74 data[indexCardinality+1] = uint16(c - math.MaxUint16) 75 } else { 76 data[indexCardinality] = uint16(c) 77 data[indexCardinality+1] = 0 78 } 79 } 80 81 func zeroOutContainer(c []uint16) { 82 switch c[indexType] { 83 case typeArray: 84 array(c).zeroOut() 85 case typeBitmap: 86 bitmap(c).zeroOut() 87 } 88 } 89 90 func removeRangeContainer(c []uint16, lo, hi uint16) { 91 switch c[indexType] { 92 case typeArray: 93 array(c).removeRange(lo, hi) 94 case typeBitmap: 95 bitmap(c).removeRange(lo, hi) 96 } 97 } 98 99 func calculateAndSetCardinality(data []uint16) { 100 if data[indexType] != typeBitmap { 101 panic("Non-bitmap containers should always have cardinality set correctly") 102 } 103 b := bitmap(data) 104 card := b.cardinality() 105 setCardinality(b, card) 106 } 107 108 type array []uint16 109 110 // find returns the index of the first element >= x. 111 // The index is based on data portion of the container, ignoring startIdx. 112 // If the element > than all elements present, then N is returned where N = cardinality of the 113 // container. 114 func (c array) find(x uint16) int { 115 N := getCardinality(c) 116 for i := int(startIdx); i < int(startIdx)+N; i++ { 117 if len(c) <= int(i) { 118 panic(fmt.Sprintf("find: %d len(c) %d <= i %d\n", x, len(c), i)) 119 } 120 if c[i] >= x { 121 return int(i - int(startIdx)) 122 } 123 } 124 return N 125 } 126 127 func (c array) rank(x uint16) int { 128 N := getCardinality(c) 129 idx := c.find(x) 130 if idx == N { 131 return -1 132 } 133 return idx 134 } 135 136 func (c array) has(x uint16) bool { 137 N := getCardinality(c) 138 idx := c.find(x) 139 if idx == N { 140 return false 141 } 142 return c[int(startIdx)+idx] == x 143 } 144 145 func (c array) add(x uint16) bool { 146 idx := c.find(x) 147 N := getCardinality(c) 148 offset := int(startIdx) + idx 149 150 if int(idx) < N { 151 if c[offset] == x { 152 return false 153 } 154 // The entry at offset is the first entry, which is greater than x. Move it to the right. 155 copy(c[offset+1:], c[offset:]) 156 } 157 c[offset] = x 158 incrCardinality(c) 159 return true 160 } 161 162 func (c array) remove(x uint16) bool { 163 idx := c.find(x) 164 N := getCardinality(c) 165 offset := int(startIdx) + idx 166 167 if int(idx) < N { 168 if c[offset] != x { 169 return false 170 } 171 copy(c[offset:], c[offset+1:]) 172 setCardinality(c, N-1) 173 return true 174 } 175 return false 176 } 177 178 func (c array) removeRange(lo, hi uint16) { 179 if hi < lo { 180 panic(fmt.Sprintf("args must satisfy lo <= hi, got lo: %d, hi: %d\n", lo, hi)) 181 } 182 loIdx := c.find(lo) 183 hiIdx := c.find(hi) 184 185 st := int(startIdx) 186 loVal := c[st+loIdx] 187 N := getCardinality(c) 188 189 // remove range doesn't intersect with any element in the array. 190 if hi < loVal || loIdx == N { 191 return 192 } 193 if hiIdx == N { 194 if loIdx > 0 { 195 c = c[:int(startIdx)+loIdx-1] 196 } else { 197 c = c[:int(startIdx)] 198 } 199 setCardinality(c, loIdx) 200 return 201 } 202 if c[st+hiIdx] == hi { 203 hiIdx++ 204 } 205 copy(c[st+loIdx:], c[st+hiIdx:]) 206 setCardinality(c, N-hiIdx+loIdx) 207 } 208 209 func (c array) zeroOut() { 210 setCardinality(c, 0) 211 } 212 213 // TODO: Figure out how memory allocation would work in these situations. Perhaps use allocator here? 214 func (c array) andArray(other array) []uint16 { 215 min := min(getCardinality(c), getCardinality(other)) 216 217 setc := c.all() 218 seto := other.all() 219 220 out := make([]uint16, int(startIdx)+min+1) 221 num := uint16(intersection2by2(setc, seto, out[startIdx:])) 222 223 // Truncate out to how many values were found. 224 out = out[:startIdx+num+1] 225 out[indexType] = typeArray 226 out[indexSize] = uint16(len(out)) 227 setCardinality(out, int(num)) 228 return out 229 } 230 231 // TODO: We can do this operation in-place on the src array. 232 func (c array) andNotArray(other array, buf []uint16) []uint16 { 233 max := getCardinality(c) 234 out := make([]uint16, int(startIdx)+max+1) 235 236 andRes := array(c.andArray(other)).all() 237 srcVals := array(c).all() 238 num := uint16(difference(srcVals, andRes, out[startIdx:])) 239 240 // Truncate out to how many values were found. 241 out = out[:startIdx+num+1] 242 out[indexType] = typeArray 243 out[indexSize] = uint16(len(out)) 244 setCardinality(out, int(num)) 245 return out 246 } 247 248 func (c array) orArray(other array, buf []uint16, runMode int) []uint16 { 249 // We ignore runInline for this call. 250 251 max := getCardinality(c) + getCardinality(other) 252 if max > 4096 { 253 // Use bitmap container. 254 out := bitmap(c.toBitmapContainer(buf)) 255 // For now, just keep it as a bitmap. No need to change if the 256 // cardinality is smaller than 4096. 257 out.orArray(other, nil, runMode|runInline) 258 // Return out because out is pointing to buf. This would allow the 259 // receiver to copy out. 260 return out 261 } 262 263 // The output would be of typeArray. 264 out := buf[:int(startIdx)+max] 265 num := union2by2(c.all(), other.all(), out[startIdx:]) 266 out[indexType] = typeArray 267 out[indexSize] = uint16(len(out)) 268 setCardinality(out, num) 269 return out 270 } 271 272 var tmp = make([]uint16, 8192) 273 274 func (c array) andBitmap(other bitmap) []uint16 { 275 out := make([]uint16, int(startIdx)+getCardinality(c)+2) // some extra space. 276 out[indexType] = typeArray 277 278 pos := startIdx 279 for _, x := range c.all() { 280 out[pos] = x 281 pos += other.bitValue(x) 282 } 283 284 // Ensure we have at least one empty slot at the end. 285 res := out[:pos+1] 286 res[indexSize] = uint16(len(res)) 287 setCardinality(res, int(pos-startIdx)) 288 return res 289 } 290 291 // TODO: Write an optmized version of this function. 292 func (c array) andNotBitmap(other bitmap, buf []uint16) []uint16 { 293 assert(len(buf) == maxContainerSize) 294 res := array(buf) 295 Memclr(res) 296 res[indexSize] = 4 297 for _, e := range c.all() { 298 if !other.has(e) { 299 if res.add(e) { 300 res[indexSize]++ 301 } 302 } 303 } 304 return res 305 } 306 307 func (c array) isFull() bool { 308 N := getCardinality(c) 309 return int(startIdx)+N >= len(c) 310 } 311 312 func (c array) all() []uint16 { 313 N := getCardinality(c) 314 return c[startIdx : int(startIdx)+N] 315 } 316 317 func (c array) minimum() uint16 { 318 N := getCardinality(c) 319 if N == 0 { 320 return 0 321 } 322 return c[startIdx] 323 } 324 325 func (c array) maximum() uint16 { 326 N := getCardinality(c) 327 if N == 0 { 328 return 0 329 } 330 return c[int(startIdx)+N-1] 331 } 332 333 func (c array) toBitmapContainer(buf []uint16) []uint16 { 334 if len(buf) == 0 { 335 buf = make([]uint16, maxContainerSize) 336 } else { 337 assert(len(buf) == maxContainerSize) 338 assert(len(buf) == copy(buf, empty)) 339 } 340 341 b := bitmap(buf) 342 b[indexSize] = maxContainerSize 343 b[indexType] = typeBitmap 344 setCardinality(b, getCardinality(c)) 345 346 data := b[startIdx:] 347 for _, x := range c.all() { 348 idx := x >> 4 349 pos := x & 0xF 350 data[idx] |= bitmapMask[pos] 351 } 352 return b 353 } 354 355 func (c array) String() string { 356 var b strings.Builder 357 b.WriteString(fmt.Sprintf("Size: %d\n", c[0])) 358 for i, val := range c[startIdx:] { 359 b.WriteString(fmt.Sprintf("%d: %d\n", i, val)) 360 } 361 return b.String() 362 } 363 364 type bitmap []uint16 365 366 var bitmapMask []uint16 367 368 func init() { 369 bitmapMask = make([]uint16, 16) 370 for i := 0; i < 16; i++ { 371 bitmapMask[i] = 1 << (15 - i) 372 } 373 } 374 375 func (b bitmap) add(x uint16) bool { 376 idx := x >> 4 377 pos := x & 0xF 378 379 if has := b[startIdx+idx] & bitmapMask[pos]; has > 0 { 380 return false 381 } 382 383 b[startIdx+idx] |= bitmapMask[pos] 384 incrCardinality(b) 385 return true 386 } 387 388 func (b bitmap) remove(x uint16) bool { 389 idx := x >> 4 390 pos := x & 0xF 391 392 c := getCardinality(b) 393 if has := b[startIdx+idx] & bitmapMask[pos]; has > 0 { 394 b[startIdx+idx] ^= bitmapMask[pos] 395 setCardinality(b, c-1) 396 return true 397 } 398 return false 399 } 400 401 func (b bitmap) removeRange(lo, hi uint16) { 402 loIdx := lo >> 4 403 loPos := lo & 0xF 404 405 hiIdx := hi >> 4 406 hiPos := hi & 0xF 407 408 N := getCardinality(b) 409 var removed int 410 for i := loIdx + 1; i < hiIdx; i++ { 411 removed += bits.OnesCount16(b[startIdx+i]) 412 b[startIdx+i] = 0 413 } 414 415 if loIdx == hiIdx { 416 for p := loPos; p <= hiPos; p++ { 417 if b[startIdx+loIdx]&bitmapMask[p] > 0 { 418 removed++ 419 } 420 b[startIdx+loIdx] &= ^bitmapMask[p] 421 } 422 setCardinality(b, N-removed) 423 return 424 } 425 for p := loPos; p < 1<<4; p++ { 426 if b[startIdx+loIdx]&bitmapMask[p] > 0 { 427 removed++ 428 } 429 b[startIdx+loIdx] &= ^bitmapMask[p] 430 } 431 for p := uint16(0); p <= hiPos; p++ { 432 if b[startIdx+hiIdx]&bitmapMask[p] > 0 { 433 removed++ 434 } 435 b[startIdx+hiIdx] &= ^bitmapMask[p] 436 } 437 setCardinality(b, N-removed) 438 } 439 440 func (b bitmap) has(x uint16) bool { 441 idx := x >> 4 442 pos := x & 0xF 443 has := b[startIdx+idx] & bitmapMask[pos] 444 return has > 0 445 } 446 447 func (b bitmap) rank(x uint16) int { 448 idx := x >> 4 449 pos := x & 0xF 450 if b[startIdx+idx]&bitmapMask[pos] == 0 { 451 return -1 452 } 453 454 var rank int 455 for i := 0; i < int(idx); i++ { 456 rank += bits.OnesCount16(b[int(startIdx)+i]) 457 } 458 for p := uint16(0); p <= pos; p++ { 459 if b[startIdx+idx]&bitmapMask[p] > 0 { 460 rank++ 461 } 462 } 463 return rank - 1 464 } 465 466 // TODO: This can perhaps be using SIMD instructions. 467 func (b bitmap) andBitmap(other bitmap) []uint16 { 468 out := make([]uint16, maxContainerSize) 469 out[indexSize] = maxContainerSize 470 out[indexType] = typeBitmap 471 var num int 472 for i := int(startIdx); i < len(b); i++ { 473 out[i] = b[i] & other[i] 474 num += bits.OnesCount16(out[i]) 475 } 476 setCardinality(out, num) 477 return out 478 } 479 480 func (b bitmap) orBitmap(other bitmap, buf []uint16, runMode int) []uint16 { 481 if runMode&runInline > 0 { 482 buf = b 483 } else { 484 copy(buf, b) // Copy over first. 485 } 486 buf[indexSize] = maxContainerSize 487 buf[indexType] = typeBitmap 488 489 if num := getCardinality(b); num == maxCardinality { 490 // do nothing. bitmap is already full. 491 492 } else if runMode&runLazy > 0 || num == invalidCardinality { 493 data := buf[startIdx:] 494 for i, v := range other[startIdx:] { 495 data[i] |= v 496 } 497 setCardinality(buf, invalidCardinality) 498 499 } else { 500 var num int 501 data := buf[startIdx:] 502 for i, v := range other[startIdx:] { 503 data[i] |= v 504 // We are going to iterate over the entire container. So, we can 505 // just recount the cardinality, starting from num=0. 506 num += bits.OnesCount16(data[i]) 507 } 508 setCardinality(buf, num) 509 } 510 if runMode&runInline > 0 { 511 return nil 512 } 513 return buf 514 } 515 516 func (b bitmap) andNotBitmap(other bitmap) []uint16 { 517 var num int 518 data := b[startIdx:] 519 for i, v := range other[startIdx:] { 520 data[i] = data[i] ^ (data[i] & v) 521 num += bits.OnesCount16(data[i]) 522 } 523 setCardinality(b, num) 524 return b 525 } 526 527 func (b bitmap) andNotArray(other array) []uint16 { 528 for _, e := range other.all() { 529 b.remove(e) 530 } 531 return b 532 } 533 534 func (b bitmap) orArray(other array, buf []uint16, runMode int) []uint16 { 535 if runMode&runInline > 0 { 536 buf = b 537 } else { 538 copy(buf, b) 539 } 540 541 if num := getCardinality(b); num == maxCardinality { 542 // do nothing. This bitmap is already full. 543 544 } else if runMode&runLazy > 0 || num == invalidCardinality { 545 // Avoid calculating the cardinality to speed up operations. 546 for _, x := range other.all() { 547 idx := x / 16 548 pos := x % 16 549 550 buf[startIdx+idx] |= bitmapMask[pos] 551 } 552 setCardinality(buf, invalidCardinality) 553 554 } else { 555 num := getCardinality(buf) 556 for _, x := range other.all() { 557 idx := x / 16 558 pos := x % 16 559 560 val := &buf[4+idx] 561 before := bits.OnesCount16(*val) 562 *val |= bitmapMask[pos] 563 after := bits.OnesCount16(*val) 564 num += after - before 565 } 566 setCardinality(buf, num) 567 } 568 569 if runMode&runInline > 0 { 570 return nil 571 } 572 return buf 573 } 574 575 func (b bitmap) all() []uint16 { 576 var res []uint16 577 data := b[startIdx:] 578 for idx := uint16(0); idx < uint16(len(data)); idx++ { 579 x := data[idx] 580 // TODO: This could potentially be optimized. 581 for pos := uint16(0); pos < 16; pos++ { 582 if x&bitmapMask[pos] > 0 { 583 res = append(res, (idx<<4)|pos) 584 } 585 } 586 } 587 return res 588 } 589 590 //TODO: It can be optimized. 591 func (b bitmap) selectAt(idx int) uint16 { 592 data := b[startIdx:] 593 n := uint16(len(data)) 594 for i := uint16(0); i < n; i++ { 595 x := data[i] 596 c := bits.OnesCount16(x) 597 if idx < c { 598 for pos := uint16(0); pos < 16; pos++ { 599 if idx == 0 && x&bitmapMask[pos] > 0 { 600 return i*16 + pos 601 } 602 if x&bitmapMask[pos] > 0 { 603 idx-- 604 } 605 } 606 607 } 608 idx -= c 609 } 610 panic("should not reach here") 611 } 612 613 // bitValue returns a 0 or a 1 depending upon whether x is present in the bitmap, where 1 means 614 // present and 0 means absent. 615 func (b bitmap) bitValue(x uint16) uint16 { 616 idx := x >> 4 617 return (b[4+idx] >> (15 - (x & 0xF))) & 1 618 } 619 620 func (b bitmap) isFull() bool { 621 return false 622 } 623 624 func (b bitmap) minimum() uint16 { 625 N := getCardinality(b) 626 if N == 0 { 627 return 0 628 } 629 for i, x := range b[startIdx:] { 630 lz := bits.LeadingZeros16(x) 631 if lz == 16 { 632 continue 633 } 634 return uint16(16*i + lz) 635 } 636 panic("We shouldn't reach here") 637 } 638 639 func (b bitmap) maximum() uint16 { 640 N := getCardinality(b) 641 if N == 0 { 642 return 0 643 } 644 for i := len(b) - 1; i >= int(startIdx); i-- { 645 x := b[i] 646 tz := bits.TrailingZeros16(x) 647 if tz == 16 { 648 continue 649 } 650 return uint16(16*i + 15 - tz) 651 } 652 panic("We shouldn't reach here") 653 } 654 655 func (b bitmap) cardinality() int { 656 var num int 657 for _, x := range b[startIdx:] { 658 num += bits.OnesCount16(x) 659 } 660 return num 661 } 662 663 var zeroContainer = make([]uint16, maxContainerSize) 664 665 func (b bitmap) zeroOut() { 666 setCardinality(b, 0) 667 copy(b[startIdx:], zeroContainer[startIdx:]) 668 } 669 670 var ( 671 runInline = 0x01 672 runLazy = 0x02 673 ) 674 675 func containerOr(ac, bc, buf []uint16, runMode int) []uint16 { 676 at := ac[indexType] 677 bt := bc[indexType] 678 679 if at == typeArray && bt == typeArray { 680 left := array(ac) 681 right := array(bc) 682 // We can't always inline this function. If the right container has 683 // enough entries, trying to do a union with the left container inplace 684 // could end up overwriting the left container entries. So, we use a 685 // buffer to hold all output, and then copy it over to left. 686 // 687 // TODO: If right doesn't have a lot of entries, we could just iterate 688 // over left and merge the entries from right inplace. Would be faster 689 // than copying over all entries into buffer. Worth trying that approach. 690 return left.orArray(right, buf, runMode) 691 } 692 if at == typeArray && bt == typeBitmap { 693 left := array(ac) 694 right := bitmap(bc) 695 // Don't run inline for this call. 696 return right.orArray(left, buf, runMode&^runInline) 697 } 698 699 // These two following cases can be fully inlined. 700 if at == typeBitmap && bt == typeArray { 701 left := bitmap(ac) 702 right := array(bc) 703 return left.orArray(right, buf, runMode) 704 } 705 if at == typeBitmap && bt == typeBitmap { 706 left := bitmap(ac) 707 right := bitmap(bc) 708 return left.orBitmap(right, buf, runMode) 709 } 710 panic("containerOr: We should not reach here") 711 } 712 713 func containerAnd(ac, bc []uint16) []uint16 { 714 at := ac[indexType] 715 bt := bc[indexType] 716 717 if at == typeArray && bt == typeArray { 718 left := array(ac) 719 right := array(bc) 720 return left.andArray(right) 721 } 722 if at == typeArray && bt == typeBitmap { 723 left := array(ac) 724 right := bitmap(bc) 725 return left.andBitmap(right) 726 } 727 if at == typeBitmap && bt == typeArray { 728 left := bitmap(ac) 729 right := array(bc) 730 out := right.andBitmap(left) 731 return out 732 } 733 if at == typeBitmap && bt == typeBitmap { 734 left := bitmap(ac) 735 right := bitmap(bc) 736 return left.andBitmap(right) 737 } 738 panic("containerAnd: We should not reach here") 739 } 740 741 // TODO: Optimize this function. 742 func containerAndNot(ac, bc, buf []uint16) []uint16 { 743 at := ac[indexType] 744 bt := bc[indexType] 745 746 if at == typeArray && bt == typeArray { 747 left := array(ac) 748 right := array(bc) 749 return left.andNotArray(right, buf) 750 } 751 if at == typeArray && bt == typeBitmap { 752 left := array(ac) 753 right := bitmap(bc) 754 return left.andNotBitmap(right, buf) 755 } 756 if at == typeBitmap && bt == typeArray { 757 left := bitmap(ac) 758 right := array(bc) 759 out := left.andNotArray(right) 760 return out 761 } 762 if at == typeBitmap && bt == typeBitmap { 763 left := bitmap(ac) 764 right := bitmap(bc) 765 return left.andNotBitmap(right) 766 } 767 panic("containerAndNot: We should not reach here") 768 }