github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/blocktree/segmentTermEnum.go (about) 1 package blocktree 2 3 import ( 4 "fmt" 5 . "github.com/balzaczyy/golucene/core/index/model" 6 "github.com/balzaczyy/golucene/core/store" 7 "github.com/balzaczyy/golucene/core/util" 8 "github.com/balzaczyy/golucene/core/util/fst" 9 "sort" 10 // "strconv" 11 ) 12 13 // blocktree/SegmentTermsEnum.java 14 15 var fstOutputs = fst.ByteSequenceOutputsSingleton() 16 var noOutput = fstOutputs.NoOutput() 17 18 // Iterates through terms in this field 19 type SegmentTermsEnum struct { 20 *TermsEnumImpl 21 22 // lazy init: 23 in store.IndexInput 24 25 stack []*segmentTermsEnumFrame 26 staticFrame *segmentTermsEnumFrame 27 currentFrame *segmentTermsEnumFrame 28 termExists bool 29 fr *FieldReader 30 31 targetBeforeCurrentLength int 32 33 // What prefix of the current term was present in the index: 34 scratchReader *store.ByteArrayDataInput 35 36 // What prefix of the current term was present in the index:; when 37 // we only next() through the index, this stays at 0. It's only set 38 // when we seekCeil/Exact: 39 validIndexPrefix int 40 41 // assert only: 42 eof bool 43 44 term *util.BytesRefBuilder 45 fstReader fst.BytesReader 46 47 arcs []*fst.Arc 48 } 49 50 func newSegmentTermsEnum(r *FieldReader) *SegmentTermsEnum { 51 ans := &SegmentTermsEnum{ 52 fr: r, 53 stack: make([]*segmentTermsEnumFrame, 0), 54 scratchReader: store.NewEmptyByteArrayDataInput(), 55 term: util.NewBytesRefBuilder(), 56 arcs: make([]*fst.Arc, 1), 57 } 58 ans.TermsEnumImpl = NewTermsEnumImpl(ans) 59 // fmt.Printf("BTTR.init seg=%v\n", r.parent.segment) 60 61 // Used to hold seek by TermState, or cached seek 62 ans.staticFrame = newFrame(ans, -1) 63 64 if r.index != nil { 65 ans.fstReader = r.index.BytesReader() 66 } 67 68 // Init w/ root block; don't use index since it may 69 // not (and need not) have been loaded 70 for i, _ := range ans.arcs { 71 ans.arcs[i] = &fst.Arc{} 72 } 73 74 ans.currentFrame = ans.staticFrame 75 var arc *fst.Arc 76 if r.index != nil { 77 arc = r.index.FirstArc(ans.arcs[0]) 78 // Empty string prefix must have an output in the index! 79 if !arc.IsFinal() { 80 panic("assert fail") 81 } 82 } 83 ans.validIndexPrefix = 0 84 // fmt.Printf("init frame state %v\n", ans.currentFrame.ord) 85 ans.printSeekState() 86 87 // ans.computeBlockStats() 88 89 return ans 90 } 91 92 func (e *SegmentTermsEnum) initIndexInput() { 93 if e.in == nil { 94 e.in = e.fr.parent.in.Clone() 95 } 96 } 97 98 func (e *SegmentTermsEnum) frame(ord int) *segmentTermsEnumFrame { 99 if ord == len(e.stack) { 100 e.stack = append(e.stack, newFrame(e, ord)) 101 } else if ord > len(e.stack) { 102 // TODO over-allocate to ensure performance 103 next := make([]*segmentTermsEnumFrame, 1+ord) 104 copy(next, e.stack) 105 for i := len(e.stack); i < len(next); i++ { 106 next[i] = newFrame(e, i) 107 } 108 e.stack = next 109 } 110 assert(e.stack[ord].ord == ord) 111 return e.stack[ord] 112 } 113 114 func (e *SegmentTermsEnum) getArc(ord int) *fst.Arc { 115 if ord == len(e.arcs) { 116 e.arcs = append(e.arcs, &fst.Arc{}) 117 } else if ord > len(e.arcs) { 118 // TODO over-allocate 119 next := make([]*fst.Arc, 1+ord) 120 copy(next, e.arcs) 121 for i := len(e.arcs); i < len(next); i++ { 122 next[i] = &fst.Arc{} 123 } 124 e.arcs = next 125 } 126 return e.arcs[ord] 127 } 128 129 func (e *SegmentTermsEnum) Comparator() sort.Interface { 130 panic("not implemented yet") 131 } 132 133 // Pushes a frame we seek'd to 134 func (e *SegmentTermsEnum) pushFrame(arc *fst.Arc, frameData []byte, length int) (f *segmentTermsEnumFrame, err error) { 135 // fmt.Println("Pushing frame...") 136 e.scratchReader.Reset(frameData) 137 code, err := e.scratchReader.ReadVLong() 138 if err != nil { 139 return nil, err 140 } 141 fpSeek := int64(uint64(code) >> BTT_OUTPUT_FLAGS_NUM_BITS) 142 f = e.frame(1 + e.currentFrame.ord) 143 f.hasTerms = (code & BTT_OUTPUT_FLAG_HAS_TERMS) != 0 144 f.hasTermsOrig = f.hasTerms 145 f.isFloor = (code & BTT_OUTPUT_FLAG_IS_FLOOR) != 0 146 if f.isFloor { 147 f.setFloorData(e.scratchReader, frameData) 148 } 149 e.pushFrameAt(arc, fpSeek, length) 150 return f, err 151 } 152 153 // Pushes next'd frame or seek'd frame; we later 154 // lazy-load the frame only when needed 155 func (e *SegmentTermsEnum) pushFrameAt(arc *fst.Arc, fp int64, length int) (f *segmentTermsEnumFrame, err error) { 156 f = e.frame(1 + e.currentFrame.ord) 157 f.arc = arc 158 if f.fpOrig == fp && f.nextEnt != -1 { 159 // fmt.Printf(" push reused frame ord=%v fp=%v isFloor?=%v hasTerms=%v pref=%v nextEnt=%v targetBeforeCurrentLength=%v term.length=%v vs prefix=%v\n", 160 // f.ord, f.fp, f.isFloor, f.hasTerms, e.term, f.nextEnt, e.targetBeforeCurrentLength, e.term.length, f.prefix) 161 if f.ord > e.targetBeforeCurrentLength { 162 f.rewind() 163 } else { 164 // fmt.Println(" skip rewind!") 165 } 166 if length != f.prefix { 167 panic("assert fail") 168 } 169 } else { 170 f.nextEnt = -1 171 f.prefix = length 172 f.state.TermBlockOrd = 0 173 f.fpOrig, f.fp = fp, fp 174 f.lastSubFP = -1 175 // fmt.Printf(" push new frame ord=%v fp=%v hasTerms=%v isFloor=%v pref=%v\n", 176 // f.ord, f.fp, f.hasTerms, f.isFloor, e.term) 177 } 178 return f, nil 179 } 180 181 func (e *SegmentTermsEnum) SeekExact(target []byte) (ok bool, err error) { 182 assert2(e.fr.index != nil, "terms index was not loaded") 183 184 e.term.Grow(1 + len(target)) 185 186 e.eof = false 187 // fmt.Printf("BTTR.seekExact seg=%v target=%v:%v current=%v (exists?=%v) validIndexPrefix=%v\n", 188 // e.fr.parent.segment, e.fr.fieldInfo.Name, brToString(target), 189 // brToString(e.term.bytes), e.termExists, e.validIndexPrefix) 190 e.printSeekState() 191 192 var arc *fst.Arc 193 var targetUpto int 194 var output interface{} 195 196 e.targetBeforeCurrentLength = e.currentFrame.ord 197 198 // if e.currentFrame != e.staticFrame { 199 if e.currentFrame.ord != e.staticFrame.ord { 200 // We are already seek'd; find the common 201 // prefix of new seek term vs current term and 202 // re-use the corresponding seek state. For 203 // example, if app first seeks to foobar, then 204 // seeks to foobaz, we can re-use the seek state 205 // for the first 5 bytes. 206 207 // fmt.Printf(" re-use current seek state validIndexPrefix=%v\n", e.validIndexPrefix) 208 209 arc = e.arcs[0] 210 assert(arc.IsFinal()) 211 output = arc.Output 212 targetUpto = 0 213 214 lastFrame := e.stack[0] 215 assert(e.validIndexPrefix <= e.term.Length()) 216 217 targetLimit := len(target) 218 if e.validIndexPrefix < targetLimit { 219 targetLimit = e.validIndexPrefix 220 } 221 222 cmp := 0 223 224 // TODO: reverse vLong byte order for better FST 225 // prefix output sharing 226 227 // noOutputs := e.fstOutputs.NoOutput() 228 229 // First compare up to valid seek frames: 230 for targetUpto < targetLimit { 231 cmp = int(e.term.At(targetUpto)) - int(target[targetUpto]) 232 // fmt.Printf(" cycle targetUpto=%v (vs limit=%v) cmp=%v (targetLabel=%c vs termLabel=%c) arc.output=%v output=%v\n", 233 // targetUpto, targetLimit, cmp, target[targetUpto], e.term.bytes[targetUpto], arc.Output, output) 234 if cmp != 0 { 235 break 236 } 237 238 arc = e.arcs[1+targetUpto] 239 assert2(arc.Label == int(target[targetUpto]), 240 "arc.label=%c targetLabel=%c", arc.Label, target[targetUpto]) 241 panic("not implemented yet") 242 // if arc.Output != noOutputs { 243 // output = e.fstOutputs.Add(output, arc.Output).([]byte) 244 // } 245 if arc.IsFinal() { 246 lastFrame = e.stack[1+lastFrame.ord] 247 } 248 targetUpto++ 249 } 250 251 if cmp == 0 { 252 targetUptoMid := targetUpto 253 254 // Second compare the rest of the term, but 255 // don't save arc/output/frame; we only do this 256 // to find out if the target term is before, 257 // equal or after the current term 258 targetLimit2 := len(target) 259 if e.term.Length() < targetLimit2 { 260 targetLimit2 = e.term.Length() 261 } 262 for targetUpto < targetLimit2 { 263 cmp = int(e.term.At(targetUpto)) - int(target[targetUpto]) 264 // fmt.Printf(" cycle2 targetUpto=%v (vs limit=%v) cmp=%v (targetLabel=%c vs termLabel=%c)\n", 265 // targetUpto, targetLimit, cmp, target[targetUpto], e.term.bytes[targetUpto]) 266 if cmp != 0 { 267 break 268 } 269 targetUpto++ 270 } 271 272 if cmp == 0 { 273 cmp = e.term.Length() - len(target) 274 } 275 targetUpto = targetUptoMid 276 } 277 278 if cmp < 0 { 279 // Common case: target term is after current 280 // term, ie, app is seeking multiple terms 281 // in sorted order 282 // fmt.Printf(" target is after current (shares prefixLen=%v); frame.ord=%v\n", targetUpto, lastFrame.ord) 283 e.currentFrame = lastFrame 284 } else if cmp > 0 { 285 // Uncommon case: target term 286 // is before current term; this means we can 287 // keep the currentFrame but we must rewind it 288 // (so we scan from the start) 289 e.targetBeforeCurrentLength = lastFrame.ord 290 // fmt.Printf(" target is before current (shares prefixLen=%v); rewind frame ord=%v\n", targetUpto, lastFrame.ord) 291 e.currentFrame = lastFrame 292 e.currentFrame.rewind() 293 } else { 294 // Target is exactly the same as current term 295 assert(e.term.Length() == len(target)) 296 if e.termExists { 297 // fmt.Println(" target is same as current; return true") 298 return true, nil 299 } else { 300 // fmt.Println(" target is same as current but term doesn't exist") 301 } 302 } 303 } else { 304 e.targetBeforeCurrentLength = -1 305 arc = e.fr.index.FirstArc(e.arcs[0]) 306 307 // Empty string prefix must have an output (block) in the index! 308 assert(arc.IsFinal() && arc.Output != nil) 309 310 // fmt.Println(" no seek state; push root frame") 311 312 output = arc.Output 313 314 e.currentFrame = e.staticFrame 315 316 targetUpto = 0 317 if e.currentFrame, err = e.pushFrame(arc, fstOutputs.Add(output, arc.NextFinalOutput).([]byte), 0); err != nil { 318 return false, err 319 } 320 } 321 322 // fmt.Printf(" start index loop targetUpto=%v output=%v currentFrame.ord=%v targetBeforeCurrentLength=%v\n", 323 // targetUpto, output, e.currentFrame.ord, e.targetBeforeCurrentLength) 324 325 for targetUpto < len(target) { 326 targetLabel := int(target[targetUpto]) 327 nextArc, err := e.fr.index.FindTargetArc(targetLabel, arc, e.getArc(1+targetUpto), e.fstReader) 328 if err != nil { 329 return false, err 330 } 331 if nextArc == nil { 332 // Index is exhausted 333 // fmt.Printf(" index: index exhausted label=%c %x\n", targetLabel, targetLabel) 334 335 e.validIndexPrefix = e.currentFrame.prefix 336 337 e.currentFrame.scanToFloorFrame(target) 338 339 if !e.currentFrame.hasTerms { 340 e.termExists = false 341 e.term.Set(targetUpto, byte(targetLabel)) 342 e.term.SetLength(1 + targetUpto) 343 // fmt.Printf(" FAST NOT_FOUND term=%v\n", e.term) 344 return false, nil 345 } 346 347 if err := e.currentFrame.loadBlock(); err != nil { 348 return false, err 349 } 350 351 status, err := e.currentFrame.scanToTerm(target, true) 352 if err != nil { 353 return false, err 354 } 355 if status == SEEK_STATUS_FOUND { 356 // fmt.Printf(" return FOUND term=%v\n", e.term) 357 return true, nil 358 } else { 359 // fmt.Printf(" got %v; return NOT_FOUND term=%v\n", status, e.term) 360 return false, nil 361 } 362 } else { 363 // Follow this arc 364 arc = nextArc 365 e.term.Set(targetUpto, byte(targetLabel)) 366 // aggregate output as we go: 367 assert(arc.Output != nil) 368 if !fst.CompareFSTValue(arc.Output, noOutput) { 369 output = fstOutputs.Add(output, arc.Output) 370 } 371 // fmt.Printf(" index: follow label=%x arc.output=%v arc.nfo=%v\n", 372 // strconv.FormatInt(int64(target[targetUpto]), 16), arc.Output, arc.NextFinalOutput) 373 targetUpto++ 374 375 if arc.IsFinal() { 376 // fmt.Println(" arc is final!") 377 if e.currentFrame, err = e.pushFrame(arc, 378 fstOutputs.Add(output, arc.NextFinalOutput).([]byte), 379 targetUpto); err != nil { 380 return false, err 381 } 382 // fmt.Printf(" curFrame.ord=%v hasTerms=%v\n", e.currentFrame.ord, e.currentFrame.hasTerms) 383 } 384 } 385 } 386 387 e.validIndexPrefix = e.currentFrame.prefix 388 389 e.currentFrame.scanToFloorFrame(target) 390 391 // Target term is entirely contained in the index: 392 if !e.currentFrame.hasTerms { 393 e.termExists = false 394 e.term.SetLength(targetUpto) 395 // log.Printf(" FAST NOT_FOUND term=%v", e.term) 396 return false, nil 397 } 398 399 if err := e.currentFrame.loadBlock(); err != nil { 400 return false, err 401 } 402 403 status, err := e.currentFrame.scanToTerm(target, true) 404 if err != nil { 405 return false, err 406 } 407 if status == SEEK_STATUS_FOUND { 408 // log.Printf(" return FOUND term=%v", e.term) 409 return true, nil 410 } else { 411 // log.Printf(" got result %v; return NOT_FOUND term=%v", status, e.term) 412 return false, nil 413 } 414 } 415 416 func (e *SegmentTermsEnum) SeekCeil(text []byte) SeekStatus { 417 panic("not implemented yet") 418 } 419 420 func (e *SegmentTermsEnum) printSeekState() { 421 if e.currentFrame == e.staticFrame { 422 // log.Println(" no prior seek") 423 } else { 424 // log.Println(" prior seek state:") 425 ord := 0 426 isSeekFrame := true 427 for { 428 f := e.frame(ord) 429 assert(f != nil) 430 // prefix := e.term.Bytes()[:f.prefix] 431 if f.nextEnt == -1 { 432 // action := "(next)" 433 // if isSeekFrame { 434 // action = "(seek)" 435 // } 436 // fpOrigValue := "" 437 // if f.isFloor { 438 // fpOrigValue = fmt.Sprintf(" (fpOrig=%v", f.fpOrig) 439 // } 440 code := (f.fp << BTT_OUTPUT_FLAGS_NUM_BITS) 441 if f.hasTerms { 442 code += BTT_OUTPUT_FLAG_HAS_TERMS 443 } 444 if f.isFloor { 445 code += BTT_OUTPUT_FLAG_IS_FLOOR 446 } 447 // log.Printf(" frame %v ord=%v fp=%v%v prefixLen=%v prefix=%v hasTerms=%v isFloor=%v code=%v isLastInFloor=%v mdUpto=%v tbOrd=%v", 448 // action, ord, f.fp, fpOrigValue, f.prefix, prefix, f.hasTerms, f.isFloor, code, f.isLastInFloor, f.metaDataUpto, f.getTermBlockOrd()) 449 } else { 450 // action := "(next, loaded)" 451 // if isSeekFrame { 452 // action = "(seek, loaded)" 453 // } 454 // fpOrigValue := "" 455 // if f.isFloor { 456 // fpOrigValue = fmt.Sprintf(" (fpOrig=%v", f.fpOrig) 457 // } 458 code := (f.fp << BTT_OUTPUT_FLAGS_NUM_BITS) 459 if f.hasTerms { 460 code += BTT_OUTPUT_FLAG_HAS_TERMS 461 } 462 if f.isFloor { 463 code += BTT_OUTPUT_FLAG_IS_FLOOR 464 } 465 // log.Printf(" frame %v ord=%v fp=%v prefixLen=%v prefix=%v nextEnt=%v (of %v) hasTerms=%v isFloor=%v code=%v lastSubFP=%v isLastInFloor=%v mdUpto=%v tbOrd=%v", 466 // action, ord, f.fp, fpOrigValue, f.prefix, prefix, f.nextEnt, f.entCount, f.hasTerms, f.isFloor, code, f.lastSubFP, f.isLastInFloor, f.metaDataUpto, f.getTermBlockOrd()) 467 } 468 if e.fr.index != nil { 469 assert2(!isSeekFrame || f.arc != nil, 470 "isSeekFrame=%v f.arc=%v", isSeekFrame, f.arc) 471 panic("not implemented yet") 472 // ret, err := fst.GetFSTOutput(e.fr.index, prefix) 473 // if err != nil { 474 // panic(err) 475 // } 476 // output := ret.([]byte) 477 // if output == nil { 478 // // log.Println(" broken seek state: prefix is not final in index") 479 // panic("seek state is broken") 480 // } else if isSeekFrame && !f.isFloor { 481 // reader := store.NewByteArrayDataInput(output) 482 // codeOrig, _ := reader.ReadVLong() 483 // code := f.fp << BTT_OUTPUT_FLAGS_NUM_BITS 484 // if f.hasTerms { 485 // code += BTT_OUTPUT_FLAG_HAS_TERMS 486 // } 487 // if f.isFloor { 488 // code += BTT_OUTPUT_FLAG_IS_FLOOR 489 // } 490 // if codeOrig != code { 491 // // log.Printf(" broken seek state: output code=%v doesn't match frame code=%v", codeOrig, code) 492 // panic("seek state is broken") 493 // } 494 // } 495 } 496 if f == e.currentFrame { 497 break 498 } 499 if f.prefix == e.validIndexPrefix { 500 isSeekFrame = false 501 } 502 ord++ 503 } 504 } 505 } 506 507 func (e *SegmentTermsEnum) Next() (buf []byte, err error) { 508 panic("not implemented yet") 509 } 510 511 func (e *SegmentTermsEnum) Term() []byte { 512 assert(!e.eof) 513 return e.term.Bytes() 514 } 515 516 func assert(ok bool) { 517 if !ok { 518 panic("assert fail") 519 } 520 } 521 522 func assert2(ok bool, msg string, args ...interface{}) { 523 if !ok { 524 panic(fmt.Sprintf(msg, args...)) 525 } 526 } 527 528 func (e *SegmentTermsEnum) DocFreq() (df int, err error) { 529 assert(!e.eof) 530 // log.Println("BTTR.docFreq") 531 err = e.currentFrame.decodeMetaData() 532 df = e.currentFrame.state.DocFreq 533 // log.Printf(" return %v", df) 534 return 535 } 536 537 func (e *SegmentTermsEnum) TotalTermFreq() (tf int64, err error) { 538 assert(!e.eof) 539 err = e.currentFrame.decodeMetaData() 540 tf = e.currentFrame.state.TotalTermFreq 541 return 542 } 543 544 func (e *SegmentTermsEnum) DocsByFlags(skipDocs util.Bits, reuse DocsEnum, flags int) (de DocsEnum, err error) { 545 assert(!e.eof) 546 // log.Printf("BTTR.docs seg=%v", e.fr.parent.segment) 547 err = e.currentFrame.decodeMetaData() 548 if err != nil { 549 return nil, err 550 } 551 // log.Printf(" state=%v", e.currentFrame.state) 552 return e.fr.parent.postingsReader.Docs(e.fr.fieldInfo, e.currentFrame.state, skipDocs, reuse, flags) 553 } 554 555 func (e *SegmentTermsEnum) DocsAndPositionsByFlags(skipDocs util.Bits, reuse DocsAndPositionsEnum, flags int) DocsAndPositionsEnum { 556 panic("not implemented yet") 557 } 558 559 func (e *SegmentTermsEnum) SeekExactFromLast(target []byte, otherState TermState) error { 560 // log.Printf("BTTR.seekExact termState seg=%v target=%v state=%v", 561 // e.fr.parent.segment, brToString(target), otherState) 562 e.eof = false 563 if !fst.CompareFSTValue(target, e.term.Get()) || !e.termExists { 564 assert(otherState != nil) 565 // TODO can not assert type conversion here 566 // _, ok := otherState.(*BlockTermState) 567 // assert(ok) 568 e.currentFrame = e.staticFrame 569 e.currentFrame.state.CopyFrom(otherState) 570 e.term.Copy(target) 571 e.currentFrame.metaDataUpto = e.currentFrame.getTermBlockOrd() 572 assert(e.currentFrame.metaDataUpto > 0) 573 e.validIndexPrefix = 0 574 } else { 575 // log.Printf(" skip seek: already on target state=%v", e.currentFrame.state) 576 } 577 return nil 578 } 579 580 func copyBytes(a, b []byte) []byte { 581 if len(a) < len(b) { 582 a = make([]byte, len(b)) 583 } 584 copy(a, b) 585 return a[0:len(b)] 586 } 587 588 func (e *SegmentTermsEnum) TermState() (ts TermState, err error) { 589 assert(!e.eof) 590 if err = e.currentFrame.decodeMetaData(); err != nil { 591 return nil, err 592 } 593 ts = e.currentFrame.state.Clone() // <-- clone doesn't work here 594 // log.Printf("BTTR.termState seg=%v state=%v", e.fr.parent.segment, ts) 595 return 596 } 597 598 func (e *SegmentTermsEnum) SeekExactByPosition(ord int64) error { 599 panic("not implemented yet") 600 } 601 602 func (e *SegmentTermsEnum) Ord() int64 { 603 panic("not supported!") 604 } 605 606 func (e *SegmentTermsEnum) String() string { 607 return "SegmentTermsEnum" 608 }