github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/blocktree/segmentTermEnumFrame.go

package blocktree

import (
	"bytes"
	"fmt"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	. "github.com/balzaczyy/golucene/core/index/model"
	"github.com/balzaczyy/golucene/core/store"
	"github.com/balzaczyy/golucene/core/util"
	"github.com/balzaczyy/golucene/core/util/fst"
)

type segmentTermsEnumFrame struct {
	// Our index in stack[]:
	ord int

	hasTerms     bool
	hasTermsOrig bool
	isFloor      bool

	arc *fst.Arc

	// File pointer where this block was loaded from
	fp     int64
	fpOrig int64
	fpEnd  int64

	suffixBytes    []byte
	suffixesReader store.ByteArrayDataInput

	statBytes   []byte
	statsReader store.ByteArrayDataInput

	floorData       []byte
	floorDataReader store.ByteArrayDataInput

	// Length of prefix shared by all terms in this block
	prefix int

	// Number of entries (term or sub-block) in this block
	entCount int

	// Which term we will next read, or -1 if the block
	// isn't loaded yet
	nextEnt int

	// True if this block is either not a floor block,
	// or, it's the last sub-block of a floor block
	isLastInFloor bool

	// True if all entries are terms
	isLeafBlock bool

	lastSubFP int64

	nextFloorLabel       int
	numFollowFloorBlocks int

	// Next term to decode metaData; we decode metaData
	// lazily so that scanning to find the matching term is
	// fast and only if you find a match and app wants the
	// stats or docs/positions enums, will we decode the
	// metaData
	metaDataUpto int

	state *BlockTermState

	// metadata buffer, holding monotonic values
	longs []int64
	// metadata buffer, holding general values
	bytes       []byte
	bytesReader *store.ByteArrayDataInput

	ste *SegmentTermsEnum

	startBytePos int
	suffix       int
	subCode      int64
}

func newFrame(ste *SegmentTermsEnum, ord int) *segmentTermsEnumFrame {
	f := &segmentTermsEnumFrame{
		suffixBytes: make([]byte, 128),
		statBytes:   make([]byte, 64),
		floorData:   make([]byte, 32),
		ste:         ste,
		ord:         ord,
		longs:       make([]int64, ste.fr.longsSize),
	}
	f.state = ste.fr.parent.postingsReader.NewTermState()
	f.state.TotalTermFreq = -1
	return f
}

func (f *segmentTermsEnumFrame) setFloorData(in *store.ByteArrayDataInput, source []byte) {
	numBytes := len(source) - in.Pos
	if numBytes > len(f.floorData) {
		// TODO over allocate
		f.floorData = make([]byte, numBytes)
	}
	copy(f.floorData, source[in.Pos:])
	f.floorDataReader.Reset(f.floorData)
	f.numFollowFloorBlocks, _ = asInt(f.floorDataReader.ReadVInt())
	b, _ := f.floorDataReader.ReadByte()
	f.nextFloorLabel = int(b)
	// fmt.Printf(" setFloorData fpOrig=%v bytes=%v numFollowFloorBlocks=%v nextFloorLabel=%x\n",
	// 	f.fpOrig, source[in.Pos:], f.numFollowFloorBlocks, f.nextFloorLabel)
}

func (f *segmentTermsEnumFrame) getTermBlockOrd() int {
	if f.isLeafBlock {
		return f.nextEnt
	}
	return f.state.TermBlockOrd
}
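
// A rough sketch of the block layout that loadBlock below decodes, inferred
// from the reads it performs (widths follow the ReadVInt/ReadBytes calls
// themselves); this is a reading aid, not a normative format spec:
//
//	vInt  code                entCount = code>>1, low bit = isLastInFloor
//	vInt  code                numSuffixBytes = code>>1, low bit = isLeafBlock
//	byte[numSuffixBytes]      term suffixes for this block
//	vInt  numStatBytes
//	byte[numStatBytes]        per-term stats (docFreq, totalTermFreq delta)
//	vInt  numMetaBytes
//	byte[numMetaBytes]        postings metadata, decoded lazily in decodeMetaData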

/* Does initial decode of next block of terms; this
doesn't actually decode the docFreq, totalTermFreq,
postings details (frq/prx offset, etc.) metadata;
it just loads them as byte[] blobs which are then
decoded on-demand if the metadata is ever requested
for any term in this block. This enables terms-only
intensive consumers (eg certain MTQs, respelling) to
not pay the price of decoding metadata they won't
use. */
func (f *segmentTermsEnumFrame) loadBlock() (err error) {
	// Clone the IndexInput lazily, so that consumers
	// that just pull a TermsEnum to
	// seekExact(TermState) don't pay this cost:
	f.ste.initIndexInput()

	if f.nextEnt != -1 {
		// Already loaded
		return
	}

	f.ste.in.Seek(f.fp)
	code, err := asInt(f.ste.in.ReadVInt())
	if err != nil {
		return err
	}
	f.entCount = int(uint(code) >> 1)
	assert(f.entCount > 0)
	f.isLastInFloor = (code & 1) != 0

	assert2(f.arc == nil || !f.isLastInFloor || !f.isFloor,
		"fp=%v arc=%v isFloor=%v isLastInFloor=%v",
		f.fp, f.arc, f.isFloor, f.isLastInFloor)

	// TODO: if suffixes were stored in random-access
	// array structure, then we could do binary search
	// instead of linear scan to find target term; eg
	// we could have simple array of offsets

	// term suffixes:
	code, err = asInt(f.ste.in.ReadVInt())
	if err != nil {
		return err
	}
	f.isLeafBlock = (code & 1) != 0
	numBytes := int(uint(code) >> 1)
	if len(f.suffixBytes) < numBytes {
		f.suffixBytes = make([]byte, numBytes)
	}
	err = f.ste.in.ReadBytes(f.suffixBytes[:numBytes])
	if err != nil {
		return err
	}
	f.suffixesReader.Reset(f.suffixBytes)

	// if f.arc == nil {
	// 	fmt.Printf(" loadBlock (next) fp=%v entCount=%v prefixLen=%v isLastInFloor=%v leaf?=%v\n",
	// 		f.fp, f.entCount, f.prefix, f.isLastInFloor, f.isLeafBlock)
	// } else {
	// 	fmt.Printf(" loadBlock (seek) fp=%v entCount=%v prefixLen=%v hasTerms?=%v isFloor?=%v isLastInFloor=%v leaf?=%v\n",
	// 		f.fp, f.entCount, f.prefix, f.hasTerms, f.isFloor, f.isLastInFloor, f.isLeafBlock)
	// }

	// stats
	numBytes, err = asInt(f.ste.in.ReadVInt())
	if err != nil {
		return err
	}
	if len(f.statBytes) < numBytes {
		f.statBytes = make([]byte, numBytes)
	}
	err = f.ste.in.ReadBytes(f.statBytes[:numBytes])
	if err != nil {
		return err
	}
	f.statsReader.Reset(f.statBytes)
	f.metaDataUpto = 0

	f.state.TermBlockOrd = 0
	f.nextEnt = 0
	f.lastSubFP = -1

	// TODO: we could skip this if !hasTerms; but
	// that's rare so won't help much
	// metadata
	if numBytes, err = asInt(f.ste.in.ReadVInt()); err != nil {
		return err
	}
	if f.bytes == nil {
		f.bytes = make([]byte, util.Oversize(numBytes, 1))
		f.bytesReader = store.NewEmptyByteArrayDataInput()
	} else if len(f.bytes) < numBytes {
		f.bytes = make([]byte, util.Oversize(numBytes, 1))
	}
	if err = f.ste.in.ReadBytes(f.bytes[:numBytes]); err != nil {
		return err
	}
	f.bytesReader.Reset(f.bytes)

	// Sub-blocks of a single floor block are always
	// written one after another -- tail recurse:
	f.fpEnd = f.ste.in.FilePointer()
	// fmt.Printf(" fpEnd=%v\n", f.fpEnd)
	return nil
}

func (f *segmentTermsEnumFrame) rewind() {
	// Force reload:
	f.fp = f.fpOrig
	f.nextEnt = -1
	f.hasTerms = f.hasTermsOrig
	if f.isFloor {
		f.floorDataReader.Rewind()
		f.numFollowFloorBlocks, _ = asInt(f.floorDataReader.ReadVInt())
		assert(f.numFollowFloorBlocks > 0)
		b, _ := f.floorDataReader.ReadByte()
		f.nextFloorLabel = int(b)
	}
}

func (f *segmentTermsEnumFrame) next() bool {
	if f.isLeafBlock {
		return f.nextLeaf()
	}
	return f.nextNonLeaf()
}
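
// Decoding of the next entry is not implemented in this port (the two stubs
// below panic). As a rough, untested sketch only: a leaf entry is encoded the
// same way scanToTermLeaf consumes it (vInt suffix length, then the suffix
// bytes), so a leaf decode would look roughly like:
//
//	f.nextEnt++
//	f.suffix, _ = asInt(f.suffixesReader.ReadVInt())
//	f.startBytePos = f.suffixesReader.Pos
//	f.suffixesReader.SkipBytes(int64(f.suffix))
//	f.fillTerm() // materialize prefix+suffix into ste.term
//	f.ste.termExists = true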

// Decodes next entry; returns true if it's a sub-block
func (f *segmentTermsEnumFrame) nextLeaf() bool {
	panic("not implemented yet")
}

func (f *segmentTermsEnumFrame) nextNonLeaf() bool {
	panic("not implemented yet")
}

// TODO: make this array'd so we can do bin search?
// likely not worth it? need to measure how many
// floor blocks we "typically" get
func (f *segmentTermsEnumFrame) scanToFloorFrame(target []byte) {
	if !f.isFloor || len(target) <= f.prefix {
		// fmt.Printf(" scanToFloorFrame skip: isFloor=%v target.length=%v vs prefix=%v\n",
		// 	f.isFloor, len(target), f.prefix)
		return
	}

	targetLabel := int(target[f.prefix])
	fmt.Printf(" scanToFloorFrame fpOrig=%v targetLabel=%x vs nextFloorLabel=%x numFollowFloorBlocks=%v\n",
		f.fpOrig, targetLabel, f.nextFloorLabel, f.numFollowFloorBlocks)
	if targetLabel < f.nextFloorLabel {
		fmt.Println(" already on correct block")
		return
	}

	assert(f.numFollowFloorBlocks != 0)

	var newFP int64
	for {
		code, _ := f.floorDataReader.ReadVLong() // ignore error
		newFP = f.fpOrig + int64(uint64(code)>>1)
		f.hasTerms = (code & 1) != 0
		// fmt.Printf(" label=%x fp=%v hasTerms?=%v numFollowFloor=%v\n",
		// 	f.nextFloorLabel, newFP, f.hasTerms, f.numFollowFloorBlocks)

		f.isLastInFloor = f.numFollowFloorBlocks == 1
		f.numFollowFloorBlocks--

		if f.isLastInFloor {
			f.nextFloorLabel = 256
			fmt.Printf(" stop! last block nextFloorLabel=%x\n", f.nextFloorLabel)
			break
		} else {
			panic("niy")
		}
	}

	if newFP != f.fp {
		// Force re-load of the block:
		fmt.Printf(" force switch to fp=%v oldFP=%v\n", newFP, f.fp)
		f.nextEnt = -1
		f.fp = newFP
	} else {
		//
	}
}
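
// Roughly how SegmentTermsEnum drives a frame during a term seek; this is a
// simplified sketch of the call order only, not an excerpt of seekExact:
//
//	f.scanToFloorFrame(target)            // pick the right floor sub-block; may retarget f.fp
//	if err := f.loadBlock(); err != nil { // decode header, suffixes, stats, metadata blobs
//		return err
//	}
//	status, err := f.scanToTerm(target, true)
//	if status == SEEK_STATUS_FOUND {
//		err = f.decodeMetaData() // stats/postings metadata are only decoded on demand
//	}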

func (f *segmentTermsEnumFrame) decodeMetaData() (err error) {
	// fmt.Printf("BTTR.decodeMetadata seg=%v mdUpto=%v vs termBlockOrd=%v\n",
	// 	f.ste.fr.parent.segment, f.metaDataUpto, f.state.TermBlockOrd)

	// lazily catch up on metadata decode:
	limit := f.getTermBlockOrd()
	absolute := f.metaDataUpto == 0
	assert(limit > 0)

	// TODO: better API would be "jump straight to term=N"???
	for f.metaDataUpto < limit {
		// TODO: we could make "tiers" of metadata, ie,
		// decode docFreq/totalTF but don't decode postings
		// metadata; this way caller could get
		// docFreq/totalTF w/o paying decode cost for
		// postings

		// TODO: if docFreq were bulk decoded we could
		// just skipN here:

		// stats
		if f.state.DocFreq, err = asInt(f.statsReader.ReadVInt()); err != nil {
			return err
		}
		// fmt.Printf(" dF=%v\n", f.state.DocFreq)
		if f.ste.fr.fieldInfo.IndexOptions() != INDEX_OPT_DOCS_ONLY {
			var n int64
			if n, err = f.statsReader.ReadVLong(); err != nil {
				return err
			}
			f.state.TotalTermFreq = int64(f.state.DocFreq) + n
			// fmt.Printf(" totTF=%v\n", f.state.TotalTermFreq)
		}

		// metadata
		for i := 0; i < f.ste.fr.longsSize; i++ {
			if f.longs[i], err = f.bytesReader.ReadVLong(); err != nil {
				return err
			}
		}

		if err = f.ste.fr.parent.postingsReader.DecodeTerm(f.longs,
			f.bytesReader, f.ste.fr.fieldInfo, f.state, absolute); err != nil {
			return err
		}
		f.metaDataUpto++
		absolute = false
	}

	f.state.TermBlockOrd = f.metaDataUpto
	return nil
}

// Used only by assert
func (f *segmentTermsEnumFrame) prefixMatches(target []byte) bool {
	for i := 0; i < f.prefix; i++ {
		if target[i] != f.ste.term.At(i) {
			return false
		}
	}
	return true
}

// NOTE: sets startBytePos/suffix as a side effect
func (f *segmentTermsEnumFrame) scanToTerm(target []byte, exactOnly bool) (status SeekStatus, err error) {
	if f.isLeafBlock {
		return f.scanToTermLeaf(target, exactOnly)
	}
	return f.scanToTermNonLeaf(target, exactOnly)
}

// Target's prefix matches this block's prefix; we
// scan the entries to check if the suffix matches.
func (f *segmentTermsEnumFrame) scanToTermLeaf(target []byte, exactOnly bool) (status SeekStatus, err error) {
	// fmt.Printf(" scanToTermLeaf: block fp=%v prefix=%v nextEnt=%v (of %v) target=%v term=%v\n",
	// 	f.fp, f.prefix, f.nextEnt, f.entCount, brToString(target), f.ste.term)
	assert(f.nextEnt != -1)

	f.ste.termExists = true
	f.subCode = 0
	if f.nextEnt == f.entCount {
		if exactOnly {
			f.fillTerm()
		}
		return SEEK_STATUS_END, nil
	}

	if !f.prefixMatches(target) {
		panic("assert fail")
	}

	// Loop over each entry (term or sub-block) in this block:
	// nextTerm: while(nextEnt < entCount) {
	for {
		f.nextEnt++
		f.suffix, err = asInt(f.suffixesReader.ReadVInt())
		if err != nil {
			return 0, err
		}

		// suffixReaderPos := f.suffixesReader.Pos
		// fmt.Printf(" cycle: term %v (of %v) suffix=%v\n",
		// 	f.nextEnt-1, f.entCount, brToString(f.suffixBytes[suffixReaderPos:suffixReaderPos+f.suffix]))

		termLen := f.prefix + f.suffix
		f.startBytePos = f.suffixesReader.Pos
		f.suffixesReader.SkipBytes(int64(f.suffix))

		targetLimit := termLen
		if len(target) < termLen {
			targetLimit = len(target)
		}
		targetPos := f.prefix

		// Loop over bytes in the suffix, comparing to
		// the target
		bytePos := f.startBytePos
		isDone := false
		for {
			var cmp int
			var stop bool
			if targetPos < targetLimit {
				cmp = int(f.suffixBytes[bytePos]) - int(target[targetPos])
				bytePos++
				targetPos++
				stop = false
			} else {
				if targetPos != targetLimit {
					panic("assert fail")
				}
				cmp = termLen - len(target)
				stop = true
			}

			if cmp < 0 {
				// Current entry is still before the target;
				// keep scanning

				if f.nextEnt == f.entCount {
					if exactOnly {
						f.fillTerm()
					}
					// We are done scanning this block
					isDone = true
				}
				break
			} else if cmp > 0 {
				// Done! Current entry is after target --
				// return NOT_FOUND:
				f.fillTerm()

				// fmt.Println(" not found")
				return SEEK_STATUS_NOT_FOUND, nil
			} else if stop {
				// Exact match!

				// This cannot be a sub-block because we
				// would have followed the index to this
				// sub-block from the start:

				assert(f.ste.termExists)
				f.fillTerm()
				// fmt.Println(" found!")
				return SEEK_STATUS_FOUND, nil
			}
		}
		if isDone {
			// double jump
			break
		}
	}

	// It is possible (and OK) that terms index pointed us
	// at this block, but, we scanned the entire block and
	// did not find the term to position to. This happens
	// when the target is after the last term in the block
	// (but, before the next term in the index). EG
	// target could be foozzz, and terms index pointed us
	// to the foo* block, but the last term in this block
	// was fooz (and, eg, first term in the next block will
	// be fop).
	fmt.Println(" block end")
	if exactOnly {
		f.fillTerm()
	}

	// TODO: not consistent that in the
	// not-exact case we don't next() into the next
	// frame here
	return SEEK_STATUS_END, nil
}
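
// In a non-leaf (mixed) block, decoded below, each entry starts with a vInt
// code: the suffix length is code>>1 and the low bit marks a sub-block rather
// than a term. Term entries advance state.TermBlockOrd; sub-block entries
// carry an extra vLong subCode, and the child block's file pointer is
// fp-subCode (recorded in lastSubFP).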

// Target's prefix matches this block's prefix; we
// scan the entries to check if the suffix matches.
func (f *segmentTermsEnumFrame) scanToTermNonLeaf(target []byte,
	exactOnly bool) (status SeekStatus, err error) {

	fmt.Printf(
		" scanToTermNonLeaf: block fp=%v prefix=%v nextEnt=%v (of %v) target=%v term=%v",
		f.fp, f.prefix, f.nextEnt, f.entCount, brToString(target), "" /*brToString(term)*/)

	assert(f.nextEnt != -1)

	if f.nextEnt == f.entCount {
		panic("not implemented yet")
	}

	assert(f.prefixMatches(target))

	// Loop over each entry (term or sub-block) in this block:
	for {
		f.nextEnt++

		code, _ := f.suffixesReader.ReadVInt() // no error
		f.suffix = int(uint32(code) >> 1)

		f.ste.termExists = (code & 1) == 0
		termLen := f.prefix + f.suffix
		f.startBytePos = f.suffixesReader.Position()
		f.suffixesReader.SkipBytes(int64(f.suffix))
		if f.ste.termExists {
			f.state.TermBlockOrd++
			f.subCode = 0
		} else {
			f.subCode, _ = f.suffixesReader.ReadVLong() // no error
			f.lastSubFP = f.fp - f.subCode
		}

		targetLimit := termLen
		if len(target) < termLen {
			targetLimit = len(target)
		}
		targetPos := f.prefix

		// Loop over bytes in the suffix, comparing to the target
		bytePos := f.startBytePos
		var toNextTerm, stopScan bool
		for {
			var cmp int
			var stop bool
			if targetPos < targetLimit {
				cmp = int(f.suffixBytes[bytePos]) - int(target[targetPos])
				bytePos++
				targetPos++
				stop = false
			} else {
				assert(targetPos == targetLimit)
				cmp = termLen - len(target)
				stop = true
			}

			if cmp < 0 {
				// Current entry is still before the target;
				// keep scanning

				if f.nextEnt == f.entCount {
					if exactOnly {
						f.fillTerm()
					}
					// We are done scanning this block
					stopScan = true
					break
				} else {
					toNextTerm = true
					break
				}
			} else if cmp > 0 {
				// Done! Current entry is after target -- return NOT_FOUND:
				f.fillTerm()

				if !exactOnly && !f.ste.termExists {
					panic("niy")
				}

				fmt.Println(" not found")
				return SEEK_STATUS_NOT_FOUND, nil
			} else if stop {
				// Exact match!

				// This cannot be a sub-block because we would have followed
				// the index to this sub-block from the start:

				assert(f.ste.termExists)
				f.fillTerm()
				fmt.Println(" found!")
				return SEEK_STATUS_FOUND, nil
			}
		}
		if toNextTerm {
			continue
		}
		if stopScan {
			break
		}
	}

	// It is possible (and OK) that terms index pointed us at this
	// block, but, we scanned the entire block and did not find the
	// term to position to. This happens when the target is after the
	// last term in the block (but, before the next term in the index).
	// E.g., target could be foozzz, and terms index pointed us to the
	// foo* block, but the last term in this block was fooz (and, e.g.,
	// first term in the next block will be fop).
	fmt.Println(" block end")
	if exactOnly {
		f.fillTerm()
	}

	return SEEK_STATUS_END, nil
}

// fillTerm materializes the current entry into ste.term: the shared prefix is
// already in place, and the suffix bytes just scanned are copied in after it.
func (f *segmentTermsEnumFrame) fillTerm() {
	termLength := f.prefix + f.suffix
	f.ste.term.SetLength(termLength)
	f.ste.term.Grow(termLength)
	copy(f.ste.term.Bytes()[f.prefix:], f.suffixBytes[f.startBytePos:f.startBytePos+f.suffix])
}

// for debugging
func brToString(b []byte) string {
	if b == nil {
		return "nil"
	}
	var buf bytes.Buffer
	buf.WriteString("[")
	for i, v := range b {
		if i > 0 {
			buf.WriteString(" ")
		}
		fmt.Fprintf(&buf, "%x", v)
	}
	buf.WriteString("]")
	return fmt.Sprintf("%v %v", utf8ToString(b), buf.String())
}

// Simpler version of Lucene's own method
func utf8ToString(iso8859_1_buf []byte) string {
	// buf := make([]rune, len(iso8859_1_buf))
	// for i, b := range iso8859_1_buf {
	// 	buf[i] = rune(b)
	// }
	// return string(buf)
	// TODO remove this method
	return string(iso8859_1_buf)
}

// // Lucene's BytesRef is basically Slice in Go, except that here
// // it's used as a local buffer whose data is filled while the
// // length stays unchanged temporarily.
// type bytesRef struct {
// 	/** The contents of the BytesRef. Should never be {@code null}. */
// 	bytes []byte
// 	/** Length of used bytes. */
// 	length int
// }

// func newBytesRef() *bytesRef {
// 	return &bytesRef{}
// }

// func (br *bytesRef) toBytes() []byte {
// 	return br.bytes[0:br.length]
// }

// func (br *bytesRef) ensureSize(minSize int) {
// 	assert(minSize >= 0)
// 	if cap(br.bytes) < minSize {
// 		next := make([]byte, util.Oversize(minSize, 1))
// 		copy(next, br.bytes)
// 		br.bytes = next
// 	}
// }

// func (br *bytesRef) String() string {
// 	return brToString(br.bytes[0:br.length])
// }

// /**
//  * Copies the bytes from the given {@link BytesRef}
//  * <p>
//  * NOTE: if this would exceed the array size, this method creates a
//  * new reference array.
//  */
// func (br *bytesRef) copyBytes(other []byte) {
// 	if cap(br.bytes) < len(other) {
// 		next := make([]byte, len(other))
// 		br.bytes = next
// 	} else if len(br.bytes) < len(other) {
// 		br.bytes = br.bytes[0:len(other)]
// 	}
// 	copy(br.bytes, other)
// 	br.length = len(other)
// }