github.com/dgraph-io/sroar@v0.0.0-20220527172339-b92b7eaaf6e0/container.go (about) 1 /* 2 * Copyright 2021 Dgraph Labs, Inc. and Contributors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package sroar 18 19 import ( 20 "fmt" 21 "math" 22 "math/bits" 23 "strings" 24 ) 25 26 // container uses extra 4 []uint16 in the front as header. 27 // container[0] is used for storing the size of the container, expressed in Uint16. 28 // The container size cannot exceed the vicinity of 8KB. At 8KB, we switch from packed arrays to 29 // bitmaps. We can fit the entire uint16 worth of bitmaps in 8KB (2^16 / 8 = 8 30 // KB). 31 32 const ( 33 typeArray uint16 = 0x00 34 typeBitmap uint16 = 0x01 35 36 // Container header. 37 indexSize int = 0 38 indexType int = 1 39 indexCardinality int = 2 40 // Index 2 and 3 is used for cardinality. We need 2 uint16s to store cardinality because 41 // 2^16 will not fit in uint16. 42 startIdx uint16 = 4 43 44 minContainerSize = 64 // In Uint16. 45 // Bitmap container can contain 2^16 integers. Each integer would use one bit to represent. 46 // Given that our data is represented in []uint16s, that'd mean the size of container to store 47 // it would be divided by 16. 48 // 4 for header and 4096 for storing bitmap container. In Uint16. 49 maxContainerSize = 4 + (1<<16)/16 50 ) 51 52 func dataAt(data []uint16, i int) uint16 { return data[int(startIdx)+i] } 53 54 func incrCardinality(data []uint16) { 55 cur := getCardinality(data) 56 if cur+1 > math.MaxUint16 { 57 data[indexCardinality+1] = 1 58 } else { 59 data[indexCardinality]++ 60 } 61 } 62 63 var invalidCardinality int = math.MaxUint16 + 10 64 var maxCardinality int = math.MaxUint16 + 1 65 66 func getCardinality(data []uint16) int { 67 // This sum has to be done using two ints to avoid overflow. 68 return int(data[indexCardinality]) + int(data[indexCardinality+1]) 69 } 70 71 func setCardinality(data []uint16, c int) { 72 if c > math.MaxUint16 { 73 data[indexCardinality] = math.MaxUint16 74 data[indexCardinality+1] = uint16(c - math.MaxUint16) 75 } else { 76 data[indexCardinality] = uint16(c) 77 data[indexCardinality+1] = 0 78 } 79 } 80 81 func zeroOutContainer(c []uint16) { 82 switch c[indexType] { 83 case typeArray: 84 array(c).zeroOut() 85 case typeBitmap: 86 bitmap(c).zeroOut() 87 } 88 } 89 90 func removeRangeContainer(c []uint16, lo, hi uint16) { 91 switch c[indexType] { 92 case typeArray: 93 array(c).removeRange(lo, hi) 94 case typeBitmap: 95 bitmap(c).removeRange(lo, hi) 96 } 97 } 98 99 func calculateAndSetCardinality(data []uint16) { 100 if data[indexType] != typeBitmap { 101 panic("Non-bitmap containers should always have cardinality set correctly") 102 } 103 b := bitmap(data) 104 card := b.cardinality() 105 setCardinality(b, card) 106 } 107 108 type array []uint16 109 110 // find returns the index of the first element >= x. 111 // The index is based on data portion of the container, ignoring startIdx. 112 // If the element > than all elements present, then N is returned where N = cardinality of the 113 // container. 114 func (c array) find(x uint16) int { 115 N := getCardinality(c) 116 for i := int(startIdx); i < int(startIdx)+N; i++ { 117 if len(c) <= int(i) { 118 panic(fmt.Sprintf("find: %d len(c) %d <= i %d\n", x, len(c), i)) 119 } 120 if c[i] >= x { 121 return int(i - int(startIdx)) 122 } 123 } 124 return N 125 } 126 127 func (c array) rank(x uint16) int { 128 N := getCardinality(c) 129 idx := c.find(x) 130 if idx == N { 131 return -1 132 } 133 return idx 134 } 135 136 func (c array) has(x uint16) bool { 137 N := getCardinality(c) 138 idx := c.find(x) 139 if idx == N { 140 return false 141 } 142 return c[int(startIdx)+idx] == x 143 } 144 145 func (c array) add(x uint16) bool { 146 idx := c.find(x) 147 N := getCardinality(c) 148 offset := int(startIdx) + idx 149 150 if int(idx) < N { 151 if c[offset] == x { 152 return false 153 } 154 // The entry at offset is the first entry, which is greater than x. Move it to the right. 155 copy(c[offset+1:], c[offset:]) 156 } 157 c[offset] = x 158 incrCardinality(c) 159 return true 160 } 161 162 func (c array) remove(x uint16) bool { 163 idx := c.find(x) 164 N := getCardinality(c) 165 offset := int(startIdx) + idx 166 167 if int(idx) < N { 168 if c[offset] != x { 169 return false 170 } 171 copy(c[offset:], c[offset+1:]) 172 setCardinality(c, N-1) 173 return true 174 } 175 return false 176 } 177 178 func (c array) removeRange(lo, hi uint16) { 179 if hi < lo { 180 panic(fmt.Sprintf("args must satisfy lo <= hi, got lo: %d, hi: %d\n", lo, hi)) 181 } 182 loIdx := c.find(lo) 183 hiIdx := c.find(hi) 184 185 st := int(startIdx) 186 loVal := c[st+loIdx] 187 N := getCardinality(c) 188 189 // remove range doesn't intersect with any element in the array. 190 if hi < loVal || loIdx == N { 191 return 192 } 193 if hiIdx == N { 194 if loIdx > 0 { 195 c = c[:int(startIdx)+loIdx-1] 196 } else { 197 c = c[:int(startIdx)] 198 } 199 setCardinality(c, loIdx) 200 return 201 } 202 if c[st+hiIdx] == hi { 203 hiIdx++ 204 } 205 copy(c[st+loIdx:], c[st+hiIdx:]) 206 setCardinality(c, N-hiIdx+loIdx) 207 } 208 209 func (c array) zeroOut() { 210 setCardinality(c, 0) 211 } 212 213 // TODO: Figure out how memory allocation would work in these situations. Perhaps use allocator here? 214 func (c array) andArray(other array) []uint16 { 215 min := min(getCardinality(c), getCardinality(other)) 216 217 setc := c.all() 218 seto := other.all() 219 220 out := make([]uint16, int(startIdx)+min+1) 221 num := uint16(intersection2by2(setc, seto, out[startIdx:])) 222 223 // Truncate out to how many values were found. 224 out = out[:startIdx+num+1] 225 out[indexType] = typeArray 226 out[indexSize] = uint16(len(out)) 227 setCardinality(out, int(num)) 228 return out 229 } 230 231 // TODO: We can do this operation in-place on the src array. 232 func (c array) andNotArray(other array, buf []uint16) []uint16 { 233 max := getCardinality(c) 234 out := make([]uint16, int(startIdx)+max+1) 235 236 andRes := array(c.andArray(other)).all() 237 srcVals := array(c).all() 238 num := uint16(difference(srcVals, andRes, out[startIdx:])) 239 240 // Truncate out to how many values were found. 241 out = out[:startIdx+num+1] 242 out[indexType] = typeArray 243 out[indexSize] = uint16(len(out)) 244 setCardinality(out, int(num)) 245 return out 246 } 247 248 func (c array) orArray(other array, buf []uint16, runMode int) []uint16 { 249 // We ignore runInline for this call. 250 251 max := getCardinality(c) + getCardinality(other) 252 if max > 4096 { 253 // Use bitmap container. 254 out := bitmap(c.toBitmapContainer(buf)) 255 // For now, just keep it as a bitmap. No need to change if the 256 // cardinality is smaller than 4096. 257 out.orArray(other, nil, runMode|runInline) 258 // Return out because out is pointing to buf. This would allow the 259 // receiver to copy out. 260 return out 261 } 262 263 // The output would be of typeArray. 264 out := buf[:int(startIdx)+max] 265 num := union2by2(c.all(), other.all(), out[startIdx:]) 266 out[indexType] = typeArray 267 out[indexSize] = uint16(len(out)) 268 setCardinality(out, num) 269 return out 270 } 271 272 var tmp = make([]uint16, 8192) 273 274 func (c array) andBitmap(other bitmap) []uint16 { 275 out := make([]uint16, int(startIdx)+getCardinality(c)+2) // some extra space. 276 out[indexType] = typeArray 277 278 pos := startIdx 279 for _, x := range c.all() { 280 out[pos] = x 281 pos += other.bitValue(x) 282 } 283 284 // Ensure we have at least one empty slot at the end. 285 res := out[:pos+1] 286 res[indexSize] = uint16(len(res)) 287 setCardinality(res, int(pos-startIdx)) 288 return res 289 } 290 291 // TODO: Write an optmized version of this function. 292 func (c array) andNotBitmap(other bitmap, buf []uint16) []uint16 { 293 assert(len(buf) == maxContainerSize) 294 res := array(buf) 295 Memclr(res) 296 res[indexSize] = 4 297 for _, e := range c.all() { 298 if !other.has(e) { 299 res.add(e) 300 } 301 } 302 return res 303 } 304 305 func (c array) isFull() bool { 306 N := getCardinality(c) 307 return int(startIdx)+N >= len(c) 308 } 309 310 func (c array) all() []uint16 { 311 N := getCardinality(c) 312 return c[startIdx : int(startIdx)+N] 313 } 314 315 func (c array) minimum() uint16 { 316 N := getCardinality(c) 317 if N == 0 { 318 return 0 319 } 320 return c[startIdx] 321 } 322 323 func (c array) maximum() uint16 { 324 N := getCardinality(c) 325 if N == 0 { 326 return 0 327 } 328 return c[int(startIdx)+N-1] 329 } 330 331 func (c array) toBitmapContainer(buf []uint16) []uint16 { 332 if len(buf) == 0 { 333 buf = make([]uint16, maxContainerSize) 334 } else { 335 assert(len(buf) == maxContainerSize) 336 assert(len(buf) == copy(buf, empty)) 337 } 338 339 b := bitmap(buf) 340 b[indexSize] = maxContainerSize 341 b[indexType] = typeBitmap 342 setCardinality(b, getCardinality(c)) 343 344 data := b[startIdx:] 345 for _, x := range c.all() { 346 idx := x >> 4 347 pos := x & 0xF 348 data[idx] |= bitmapMask[pos] 349 } 350 return b 351 } 352 353 func (c array) String() string { 354 var b strings.Builder 355 b.WriteString(fmt.Sprintf("Size: %d\n", c[0])) 356 for i, val := range c[startIdx:] { 357 b.WriteString(fmt.Sprintf("%d: %d\n", i, val)) 358 } 359 return b.String() 360 } 361 362 type bitmap []uint16 363 364 var bitmapMask []uint16 365 366 func init() { 367 bitmapMask = make([]uint16, 16) 368 for i := 0; i < 16; i++ { 369 bitmapMask[i] = 1 << (15 - i) 370 } 371 } 372 373 func (b bitmap) add(x uint16) bool { 374 idx := x >> 4 375 pos := x & 0xF 376 377 if has := b[startIdx+idx] & bitmapMask[pos]; has > 0 { 378 return false 379 } 380 381 b[startIdx+idx] |= bitmapMask[pos] 382 incrCardinality(b) 383 return true 384 } 385 386 func (b bitmap) remove(x uint16) bool { 387 idx := x >> 4 388 pos := x & 0xF 389 390 c := getCardinality(b) 391 if has := b[startIdx+idx] & bitmapMask[pos]; has > 0 { 392 b[startIdx+idx] ^= bitmapMask[pos] 393 setCardinality(b, c-1) 394 return true 395 } 396 return false 397 } 398 399 func (b bitmap) removeRange(lo, hi uint16) { 400 loIdx := lo >> 4 401 loPos := lo & 0xF 402 403 hiIdx := hi >> 4 404 hiPos := hi & 0xF 405 406 N := getCardinality(b) 407 var removed int 408 for i := loIdx + 1; i < hiIdx; i++ { 409 removed += bits.OnesCount16(b[startIdx+i]) 410 b[startIdx+i] = 0 411 } 412 413 if loIdx == hiIdx { 414 for p := loPos; p <= hiPos; p++ { 415 if b[startIdx+loIdx]&bitmapMask[p] > 0 { 416 removed++ 417 } 418 b[startIdx+loIdx] &= ^bitmapMask[p] 419 } 420 setCardinality(b, N-removed) 421 return 422 } 423 for p := loPos; p < 1<<4; p++ { 424 if b[startIdx+loIdx]&bitmapMask[p] > 0 { 425 removed++ 426 } 427 b[startIdx+loIdx] &= ^bitmapMask[p] 428 } 429 for p := uint16(0); p <= hiPos; p++ { 430 if b[startIdx+hiIdx]&bitmapMask[p] > 0 { 431 removed++ 432 } 433 b[startIdx+hiIdx] &= ^bitmapMask[p] 434 } 435 setCardinality(b, N-removed) 436 } 437 438 func (b bitmap) has(x uint16) bool { 439 idx := x >> 4 440 pos := x & 0xF 441 has := b[startIdx+idx] & bitmapMask[pos] 442 return has > 0 443 } 444 445 func (b bitmap) rank(x uint16) int { 446 idx := x >> 4 447 pos := x & 0xF 448 if b[startIdx+idx]&bitmapMask[pos] == 0 { 449 return -1 450 } 451 452 var rank int 453 for i := 0; i < int(idx); i++ { 454 rank += bits.OnesCount16(b[int(startIdx)+i]) 455 } 456 for p := uint16(0); p <= pos; p++ { 457 if b[startIdx+idx]&bitmapMask[p] > 0 { 458 rank++ 459 } 460 } 461 return rank - 1 462 } 463 464 // TODO: This can perhaps be using SIMD instructions. 465 func (b bitmap) andBitmap(other bitmap) []uint16 { 466 out := make([]uint16, maxContainerSize) 467 out[indexSize] = maxContainerSize 468 out[indexType] = typeBitmap 469 var num int 470 for i := int(startIdx); i < len(b); i++ { 471 out[i] = b[i] & other[i] 472 num += bits.OnesCount16(out[i]) 473 } 474 setCardinality(out, num) 475 return out 476 } 477 478 func (b bitmap) orBitmap(other bitmap, buf []uint16, runMode int) []uint16 { 479 if runMode&runInline > 0 { 480 buf = b 481 } else { 482 copy(buf, b) // Copy over first. 483 } 484 buf[indexSize] = maxContainerSize 485 buf[indexType] = typeBitmap 486 487 if num := getCardinality(b); num == maxCardinality { 488 // do nothing. bitmap is already full. 489 490 } else if runMode&runLazy > 0 || num == invalidCardinality { 491 data := buf[startIdx:] 492 for i, v := range other[startIdx:] { 493 data[i] |= v 494 } 495 setCardinality(buf, invalidCardinality) 496 497 } else { 498 var num int 499 data := buf[startIdx:] 500 for i, v := range other[startIdx:] { 501 data[i] |= v 502 // We are going to iterate over the entire container. So, we can 503 // just recount the cardinality, starting from num=0. 504 num += bits.OnesCount16(data[i]) 505 } 506 setCardinality(buf, num) 507 } 508 if runMode&runInline > 0 { 509 return nil 510 } 511 return buf 512 } 513 514 func (b bitmap) andNotBitmap(other bitmap) []uint16 { 515 var num int 516 data := b[startIdx:] 517 for i, v := range other[startIdx:] { 518 data[i] = data[i] ^ (data[i] & v) 519 num += bits.OnesCount16(data[i]) 520 } 521 setCardinality(b, num) 522 return b 523 } 524 525 func (b bitmap) andNotArray(other array) []uint16 { 526 for _, e := range other.all() { 527 b.remove(e) 528 } 529 return b 530 } 531 532 func (b bitmap) orArray(other array, buf []uint16, runMode int) []uint16 { 533 if runMode&runInline > 0 { 534 buf = b 535 } else { 536 copy(buf, b) 537 } 538 539 if num := getCardinality(b); num == maxCardinality { 540 // do nothing. This bitmap is already full. 541 542 } else if runMode&runLazy > 0 || num == invalidCardinality { 543 // Avoid calculating the cardinality to speed up operations. 544 for _, x := range other.all() { 545 idx := x / 16 546 pos := x % 16 547 548 buf[startIdx+idx] |= bitmapMask[pos] 549 } 550 setCardinality(buf, invalidCardinality) 551 552 } else { 553 num := getCardinality(buf) 554 for _, x := range other.all() { 555 idx := x / 16 556 pos := x % 16 557 558 val := &buf[4+idx] 559 before := bits.OnesCount16(*val) 560 *val |= bitmapMask[pos] 561 after := bits.OnesCount16(*val) 562 num += after - before 563 } 564 setCardinality(buf, num) 565 } 566 567 if runMode&runInline > 0 { 568 return nil 569 } 570 return buf 571 } 572 573 func (b bitmap) all() []uint16 { 574 var res []uint16 575 data := b[startIdx:] 576 for idx := uint16(0); idx < uint16(len(data)); idx++ { 577 x := data[idx] 578 // TODO: This could potentially be optimized. 579 for pos := uint16(0); pos < 16; pos++ { 580 if x&bitmapMask[pos] > 0 { 581 res = append(res, (idx<<4)|pos) 582 } 583 } 584 } 585 return res 586 } 587 588 //TODO: It can be optimized. 589 func (b bitmap) selectAt(idx int) uint16 { 590 data := b[startIdx:] 591 n := uint16(len(data)) 592 for i := uint16(0); i < n; i++ { 593 x := data[i] 594 c := bits.OnesCount16(x) 595 if idx < c { 596 for pos := uint16(0); pos < 16; pos++ { 597 if idx == 0 && x&bitmapMask[pos] > 0 { 598 return i*16 + pos 599 } 600 if x&bitmapMask[pos] > 0 { 601 idx-- 602 } 603 } 604 605 } 606 idx -= c 607 } 608 panic("should not reach here") 609 } 610 611 // bitValue returns a 0 or a 1 depending upon whether x is present in the bitmap, where 1 means 612 // present and 0 means absent. 613 func (b bitmap) bitValue(x uint16) uint16 { 614 idx := x >> 4 615 return (b[4+idx] >> (15 - (x & 0xF))) & 1 616 } 617 618 func (b bitmap) isFull() bool { 619 return false 620 } 621 622 func (b bitmap) minimum() uint16 { 623 N := getCardinality(b) 624 if N == 0 { 625 return 0 626 } 627 for i, x := range b[startIdx:] { 628 lz := bits.LeadingZeros16(x) 629 if lz == 16 { 630 continue 631 } 632 return uint16(16*i + lz) 633 } 634 panic("We shouldn't reach here") 635 } 636 637 func (b bitmap) maximum() uint16 { 638 N := getCardinality(b) 639 if N == 0 { 640 return 0 641 } 642 for i := len(b) - 1; i >= int(startIdx); i-- { 643 x := b[i] 644 tz := bits.TrailingZeros16(x) 645 if tz == 16 { 646 continue 647 } 648 return uint16(16*i + 15 - tz) 649 } 650 panic("We shouldn't reach here") 651 } 652 653 func (b bitmap) cardinality() int { 654 var num int 655 for _, x := range b[startIdx:] { 656 num += bits.OnesCount16(x) 657 } 658 return num 659 } 660 661 var zeroContainer = make([]uint16, maxContainerSize) 662 663 func (b bitmap) zeroOut() { 664 setCardinality(b, 0) 665 copy(b[startIdx:], zeroContainer[startIdx:]) 666 } 667 668 var ( 669 runInline = 0x01 670 runLazy = 0x02 671 ) 672 673 func containerOr(ac, bc, buf []uint16, runMode int) []uint16 { 674 at := ac[indexType] 675 bt := bc[indexType] 676 677 if at == typeArray && bt == typeArray { 678 left := array(ac) 679 right := array(bc) 680 // We can't always inline this function. If the right container has 681 // enough entries, trying to do a union with the left container inplace 682 // could end up overwriting the left container entries. So, we use a 683 // buffer to hold all output, and then copy it over to left. 684 // 685 // TODO: If right doesn't have a lot of entries, we could just iterate 686 // over left and merge the entries from right inplace. Would be faster 687 // than copying over all entries into buffer. Worth trying that approach. 688 return left.orArray(right, buf, runMode) 689 } 690 if at == typeArray && bt == typeBitmap { 691 left := array(ac) 692 right := bitmap(bc) 693 // Don't run inline for this call. 694 return right.orArray(left, buf, runMode&^runInline) 695 } 696 697 // These two following cases can be fully inlined. 698 if at == typeBitmap && bt == typeArray { 699 left := bitmap(ac) 700 right := array(bc) 701 return left.orArray(right, buf, runMode) 702 } 703 if at == typeBitmap && bt == typeBitmap { 704 left := bitmap(ac) 705 right := bitmap(bc) 706 return left.orBitmap(right, buf, runMode) 707 } 708 panic("containerOr: We should not reach here") 709 } 710 711 func containerAnd(ac, bc []uint16) []uint16 { 712 at := ac[indexType] 713 bt := bc[indexType] 714 715 if at == typeArray && bt == typeArray { 716 left := array(ac) 717 right := array(bc) 718 return left.andArray(right) 719 } 720 if at == typeArray && bt == typeBitmap { 721 left := array(ac) 722 right := bitmap(bc) 723 return left.andBitmap(right) 724 } 725 if at == typeBitmap && bt == typeArray { 726 left := bitmap(ac) 727 right := array(bc) 728 out := right.andBitmap(left) 729 return out 730 } 731 if at == typeBitmap && bt == typeBitmap { 732 left := bitmap(ac) 733 right := bitmap(bc) 734 return left.andBitmap(right) 735 } 736 panic("containerAnd: We should not reach here") 737 } 738 739 // TODO: Optimize this function. 740 func containerAndNot(ac, bc, buf []uint16) []uint16 { 741 at := ac[indexType] 742 bt := bc[indexType] 743 744 if at == typeArray && bt == typeArray { 745 left := array(ac) 746 right := array(bc) 747 return left.andNotArray(right, buf) 748 } 749 if at == typeArray && bt == typeBitmap { 750 left := array(ac) 751 right := bitmap(bc) 752 return left.andNotBitmap(right, buf) 753 } 754 if at == typeBitmap && bt == typeArray { 755 left := bitmap(ac) 756 right := array(bc) 757 out := left.andNotArray(right) 758 return out 759 } 760 if at == typeBitmap && bt == typeBitmap { 761 left := bitmap(ac) 762 right := bitmap(bc) 763 return left.andNotBitmap(right) 764 } 765 panic("containerAndNot: We should not reach here") 766 }