github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/blocktree/termsWriter.go

package blocktree

import (
	"fmt"
	"github.com/balzaczyy/golucene/core/codec"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	. "github.com/balzaczyy/golucene/core/index/model"
	"github.com/balzaczyy/golucene/core/store"
	"github.com/balzaczyy/golucene/core/util"
	"github.com/balzaczyy/golucene/core/util/fst"
	"github.com/balzaczyy/golucene/core/util/packed"
	"io"
	"math"
	"strings"
)

// codec/PostingsWriterBase.java

/*
Extension of PostingsConsumer to support pluggable term dictionaries.

This class contains additional hooks to interact with the provided
term dictionaries such as BlockTreeTermsWriter. If you want to re-use
an existing implementation and are only interested in customizing the
format of the postings list, extend this class instead.
*/
type PostingsWriterBase interface {
	codec.PostingsConsumer
	io.Closer

	// Called once after startup, before any terms have been added.
	// Implementations typically write a header to the provided termsOut.
	Init(store.IndexOutput) error
	NewTermState() *BlockTermState
	// Start a new term. Note that a matching call to finishTerm() is
	// made only if the term has at least one document.
	StartTerm() error
	// Finishes the current term. The provided TermStats contains the
	// term's summary statistics.
	FinishTerm(*BlockTermState) error
	EncodeTerm([]int64, util.DataOutput, *FieldInfo, *BlockTermState, bool) error
	// Called when writing switches to another field. Returns the number
	// of int64 metadata values (longsSize) that EncodeTerm fills per term
	// for this field.
	SetField(fieldInfo *FieldInfo) int
}
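
// The sketch below is added for exposition only (nothing in this package
// calls it; names ending in "Sketch" are illustrative). It shows the order
// in which this file drives a PostingsWriterBase for one field and one term:
// Init is called once from NewBlockTreeTermsWriter, SetField from
// newTermsWriter, StartTerm/FinishTerm from TermsWriter.StartTerm and
// TermsWriter.FinishTerm, and EncodeTerm later from writeBlock when the
// term's block is flushed.
func drivePostingsWriterSketch(pw PostingsWriterBase, termsOut store.IndexOutput,
	field *FieldInfo, stats *codec.TermStats) error {
	if err := pw.Init(termsOut); err != nil { // postings format writes its header
		return err
	}
	longsSize := pw.SetField(field) // per-term metadata width for this field
	if err := pw.StartTerm(); err != nil {
		return err
	}
	// ... postings for the term are added through the embedded PostingsConsumer ...
	state := pw.NewTermState()
	state.DocFreq = stats.DocFreq
	state.TotalTermFreq = stats.TotalTermFreq
	if err := pw.FinishTerm(state); err != nil {
		return err
	}
	_ = longsSize // writeBlock passes a []int64 of this length to EncodeTerm
	return nil
}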

// codec/BlockTreeTermsWriter.java
const (
	/* Suggested default value for the minItemsInBlock parameter. */
	DEFAULT_MIN_BLOCK_SIZE = 25

	/* Suggested default value for the maxItemsInBlock parameter. */
	DEFAULT_MAX_BLOCK_SIZE = 48

	/* Extension of terms file */
	TERMS_EXTENSION  = "tim"
	TERMS_CODEC_NAME = "BLOCK_TREE_TERMS_DICT"

	TERMS_VERSION_START = 0
	/* Append-only */
	TERMS_VERSION_APPEND_ONLY   = 1
	TERMS_VERSION_META_ARRAY    = 2
	TERMS_VERSION_CHECKSUM      = 3
	TERMS_VERSION_MIN_MAX_TERMS = 4
	/* Current terms format. */
	TERMS_VERSION_CURRENT = TERMS_VERSION_MIN_MAX_TERMS

	/* Extension of terms index file */
	TERMS_INDEX_EXTENSION  = "tip"
	TERMS_INDEX_CODEC_NAME = "BLOCK_TREE_TERMS_INDEX"
)

type BlockTreeTermsWriterSPI interface {
	WriteHeader(store.IndexOutput) error
	WriteIndexHeader(store.IndexOutput) error
}

type FieldMetaData struct {
	fieldInfo        *FieldInfo
	rootCode         []byte
	numTerms         int64
	indexStartFP     int64
	sumTotalTermFreq int64
	sumDocFreq       int64
	docCount         int
	longsSize        int
	minTerm          []byte
	maxTerm          []byte
}

func newFieldMetaData(fieldInfo *FieldInfo,
	rootCode []byte, numTerms, indexStartFP, sumTotalTermFreq, sumDocFreq int64,
	docCount, longsSize int, minTerm, maxTerm []byte) *FieldMetaData {
	assert(numTerms > 0)
	assert2(rootCode != nil, "field=%v numTerms=%v", fieldInfo.Name, numTerms)
	return &FieldMetaData{
		fieldInfo,
		rootCode,
		numTerms,
		indexStartFP,
		sumTotalTermFreq,
		sumDocFreq,
		docCount,
		longsSize,
		minTerm,
		maxTerm,
	}
}

type BlockTreeTermsWriter struct {
	spi BlockTreeTermsWriterSPI

	out             store.IndexOutput
	indexOut        store.IndexOutput
	maxDoc          int
	minItemsInBlock int
	maxItemsInBlock int

	postingsWriter PostingsWriterBase
	fieldInfos     FieldInfos
	currentField   *FieldInfo

	fields  []*FieldMetaData
	segment string

	scratchBytes   *store.RAMOutputStream
	scratchIntsRef *util.IntsRefBuilder
}

/*
Create a new writer. The number of items (terms or sub-blocks) per
block will aim to be between minItemsInBlock and maxItemsInBlock,
though in some cases the blocks may be smaller than the min.
*/
func NewBlockTreeTermsWriter(state *SegmentWriteState,
	postingsWriter PostingsWriterBase,
	minItemsInBlock, maxItemsInBlock int) (*BlockTreeTermsWriter, error) {
	assert2(minItemsInBlock >= 2, "minItemsInBlock must be >= 2; got %v", minItemsInBlock)
	assert2(maxItemsInBlock >= 1, "maxItemsInBlock must be >= 1; got %v", maxItemsInBlock)
	assert2(minItemsInBlock <= maxItemsInBlock,
		"maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=%v minItemsInBlock=%v",
		maxItemsInBlock, minItemsInBlock)
	assert2(2*(minItemsInBlock-1) <= maxItemsInBlock,
		"maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=%v minItemsInBlock=%v",
		maxItemsInBlock, minItemsInBlock)

	ans := &BlockTreeTermsWriter{
		maxDoc:          state.SegmentInfo.DocCount(),
		fieldInfos:      state.FieldInfos,
		minItemsInBlock: minItemsInBlock,
		maxItemsInBlock: maxItemsInBlock,
		postingsWriter:  postingsWriter,
		segment:         state.SegmentInfo.Name,
		scratchBytes:    store.NewRAMOutputStreamBuffer(),
		scratchIntsRef:  util.NewIntsRefBuilder(),
		// bytesWriter: store.NewRAMOutputStreamBuffer(),
		// bytesWriter2: store.NewRAMOutputStreamBuffer(),
	}
	ans.spi = ans
	var out, indexOut store.IndexOutput
	if err := func() error {
		var success = false
		defer func() {
			if !success {
				util.CloseWhileSuppressingError(out, indexOut)
			}
		}()

		var err error
		termsFileName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, TERMS_EXTENSION)
		if out, err = state.Directory.CreateOutput(termsFileName, state.Context); err != nil {
			return err
		}
		if err = ans.spi.WriteHeader(out); err != nil {
			return err
		}

		termsIndexFileName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, TERMS_INDEX_EXTENSION)
		if indexOut, err = state.Directory.CreateOutput(termsIndexFileName, state.Context); err != nil {
			return err
		}
		if err = ans.spi.WriteIndexHeader(indexOut); err != nil {
			return err
		}

		// have consumer write its format/header
		if err = postingsWriter.Init(out); err != nil {
			return err
		}
		success = true
		return nil
	}(); err != nil {
		return nil, err
	}
	ans.out = out
	ans.indexOut = indexOut
	return ans, nil
}

func (w *BlockTreeTermsWriter) WriteHeader(out store.IndexOutput) error {
	return codec.WriteHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT)
}

func (w *BlockTreeTermsWriter) WriteIndexHeader(out store.IndexOutput) error {
	return codec.WriteHeader(out, TERMS_INDEX_CODEC_NAME, TERMS_VERSION_CURRENT)
}

/* Writes the terms file trailer. */
func (w *BlockTreeTermsWriter) writeTrailer(out store.IndexOutput, dirStart int64) error {
	return out.WriteLong(dirStart)
}

/* Writes the index file trailer. */
func (w *BlockTreeTermsWriter) writeIndexTrailer(indexOut store.IndexOutput, dirStart int64) error {
	return indexOut.WriteLong(dirStart)
}

func (w *BlockTreeTermsWriter) AddField(field *FieldInfo) (TermsConsumer, error) {
	assert(w.currentField == nil || w.currentField.Name < field.Name)
	w.currentField = field
	return newTermsWriter(w, field), nil
}

func encodeOutput(fp int64, hasTerms bool, isFloor bool) int64 {
	assert(fp < (1 << 62))
	ans := (fp << 2)
	if hasTerms {
		ans |= BTT_OUTPUT_FLAG_HAS_TERMS
	}
	if isFloor {
		ans |= BTT_OUTPUT_FLAG_IS_FLOOR
	}
	return ans
}
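
// Illustrative inverse of encodeOutput, added for exposition only (nothing in
// this package calls it). It assumes BTT_OUTPUT_FLAG_HAS_TERMS and
// BTT_OUTPUT_FLAG_IS_FLOOR occupy the two bits below the shifted file pointer,
// which is what encodeOutput's fp<<2 and its fp < 1<<62 assertion imply.
func decodeOutputSketch(code int64) (fp int64, hasTerms, isFloor bool) {
	fp = code >> 2 // the upper 62 bits carry the block's file pointer
	hasTerms = code&BTT_OUTPUT_FLAG_HAS_TERMS != 0
	isFloor = code&BTT_OUTPUT_FLAG_IS_FLOOR != 0
	return fp, hasTerms, isFloor
}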

type PendingEntry interface {
	isTerm() bool
}

type PendingTerm struct {
	term []byte
	// stats + metadata
	state *BlockTermState
}

func newPendingTerm(term []byte, state *BlockTermState) *PendingTerm {
	clone := make([]byte, len(term))
	copy(clone, term)
	return &PendingTerm{clone, state}
}

func (t *PendingTerm) isTerm() bool { return true }

func (t *PendingTerm) String() string { panic("not implemented yet") }

type PendingBlock struct {
	prefix        []byte
	fp            int64
	index         *fst.FST
	subIndices    []*fst.FST
	hasTerms      bool
	isFloor       bool
	floorLeadByte int
}

func newPendingBlock(prefix []byte, fp int64, hasTerms, isFloor bool,
	floorLeadByte int, subIndices []*fst.FST) *PendingBlock {
	return &PendingBlock{
		prefix:        prefix,
		fp:            fp,
		index:         nil,
		subIndices:    subIndices,
		hasTerms:      hasTerms,
		isFloor:       isFloor,
		floorLeadByte: floorLeadByte,
	}
}

func (b *PendingBlock) isTerm() bool { return false }

func (b *PendingBlock) String() string {
	return fmt.Sprintf("BLOCK: %v", utf8ToString(b.prefix))
}

func (b *PendingBlock) compileIndex(blocks []*PendingBlock,
	scratchBytes *store.RAMOutputStream,
	scratchIntsRef *util.IntsRefBuilder) (err error) {

	assert2(b.isFloor && len(blocks) > 1 || (!b.isFloor && len(blocks) == 1),
		"isFloor=%v blocks=%v", b.isFloor, blocks)
	assert(blocks[0] == b)

	assert(scratchBytes.FilePointer() == 0)

	// TODO: try writing the leading vLong in MSB order
	// (opposite of what Lucene does today), for better
	// outputs sharing in the FST
	if err = scratchBytes.WriteVLong(encodeOutput(b.fp, b.hasTerms, b.isFloor)); err != nil {
		return
	}
	if b.isFloor {
		if err = scratchBytes.WriteVInt(int32(len(blocks) - 1)); err != nil {
			return
		}
		for _, sub := range blocks[1:] {
			assert(sub.floorLeadByte != -1)
			// fmt.Printf(" write floorLeadByte=%v\n", util.ItoHex(int64(sub.floorLeadByte)))
			if err = scratchBytes.WriteByte(byte(sub.floorLeadByte)); err != nil {
				return
			}
			assert(sub.fp > b.fp)
			if err = scratchBytes.WriteVLong((sub.fp-b.fp)<<1 |
				int64(map[bool]int{true: 1, false: 0}[sub.hasTerms])); err != nil {
				return
			}
		}
	}

	outputs := fst.ByteSequenceOutputsSingleton()
	indexBuilder := fst.NewBuilder(fst.INPUT_TYPE_BYTE1,
		0, 0, true, false, int(math.MaxInt32),
		outputs, false,
		packed.PackedInts.COMPACT, true, 15)

	// fmt.Printf(" compile index for prefix=%v\n", b.prefix)

	bytes := make([]byte, scratchBytes.FilePointer())
	assert(len(bytes) > 0)
	err = scratchBytes.WriteToBytes(bytes)
	if err != nil {
		return err
	}
	err = indexBuilder.Add(fst.ToIntsRef(b.prefix, scratchIntsRef), bytes)
	if err != nil {
		return err
	}
	scratchBytes.Reset()

	// copy over index for all sub-blocks
	for _, block := range blocks {
		if block.subIndices != nil {
			for _, subIndex := range block.subIndices {
				if err = b.append(indexBuilder, subIndex, scratchIntsRef); err != nil {
					return err
				}
			}
		}
		block.subIndices = nil
	}

	if b.index, err = indexBuilder.Finish(); err != nil {
		return err
	}
	assert(b.subIndices == nil)
	return nil
}
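
// The helper below is an expository sketch of the "root code" layout that
// compileIndex builds in scratchBytes (it is not called anywhere in this
// package): first the encodeOutput vLong for the block itself, then, for a
// floor block, the number of trailing floor sub-blocks followed by each
// sub-block's lead byte and a vLong packing its file-pointer delta with a
// hasTerms bit.
func buildFloorRootCodeSketch(b *PendingBlock, subs []*PendingBlock) ([]byte, error) {
	scratch := store.NewRAMOutputStreamBuffer()
	if err := scratch.WriteVLong(encodeOutput(b.fp, b.hasTerms, b.isFloor)); err != nil {
		return nil, err
	}
	if b.isFloor {
		if err := scratch.WriteVInt(int32(len(subs))); err != nil {
			return nil, err
		}
		for _, sub := range subs { // corresponds to blocks[1:] in compileIndex
			if err := scratch.WriteByte(byte(sub.floorLeadByte)); err != nil {
				return nil, err
			}
			hasTermsBit := int64(0)
			if sub.hasTerms {
				hasTermsBit = 1
			}
			if err := scratch.WriteVLong((sub.fp-b.fp)<<1 | hasTermsBit); err != nil {
				return nil, err
			}
		}
	}
	rootCode := make([]byte, scratch.FilePointer())
	if err := scratch.WriteToBytes(rootCode); err != nil {
		return nil, err
	}
	return rootCode, nil
}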

func (b *PendingBlock) append(
	builder *fst.Builder,
	subIndex *fst.FST,
	scratchIntsRef *util.IntsRefBuilder) error {

	subIndexEnum := fst.NewBytesRefFSTEnum(subIndex)
	indexEnt, err := subIndexEnum.Next()
	for err == nil && indexEnt != nil {
		// fmt.Printf(" add sub=%v output=%v\n", indexEnt.Input, indexEnt.Output)
		err = builder.Add(fst.ToIntsRef(indexEnt.Input.ToBytes(), scratchIntsRef), indexEnt.Output)
		if err == nil {
			indexEnt, err = subIndexEnum.Next()
		}
	}
	return err
}

type TermsWriter struct {
	owner            *BlockTreeTermsWriter
	fieldInfo        *FieldInfo
	longsSize        int
	numTerms         int64
	docsSeen         *util.FixedBitSet
	sumTotalTermFreq int64
	sumDocFreq       int64
	docCount         int
	indexStartFP     int64

	// Records index into pending where the current prefix at that
	// length "started"; for example, if current term starts with 't',
	// prefixStarts[0] is the index into pending for the first
	// term/sub-block starting with 't'. We use this to figure out when
	// to write a new block:
	lastTerm     *util.BytesRefBuilder
	prefixStarts []int

	longs []int64

	// Pending stack of terms and blocks. As terms arrive (in sorted
	// order) we append to this stack, and once the top of the stack has
	// enough terms starting with a common prefix, we write a new block
	// with those terms and replace those terms in the stack with a new
	// block:
	pending []PendingEntry

	// Reused in writeBlocks:
	newBlocks []*PendingBlock

	firstPendingTerm *PendingTerm
	lastPendingTerm  *PendingTerm

	suffixWriter *store.RAMOutputStream
	statsWriter  *store.RAMOutputStream
	metaWriter   *store.RAMOutputStream
	bytesWriter  *store.RAMOutputStream
}

func newTermsWriter(owner *BlockTreeTermsWriter,
	fieldInfo *FieldInfo) *TermsWriter {
	ans := &TermsWriter{
		owner:        owner,
		fieldInfo:    fieldInfo,
		lastTerm:     util.NewBytesRefBuilder(),
		prefixStarts: make([]int, 8),
		suffixWriter: store.NewRAMOutputStreamBuffer(),
		statsWriter:  store.NewRAMOutputStreamBuffer(),
		metaWriter:   store.NewRAMOutputStreamBuffer(),
		bytesWriter:  store.NewRAMOutputStreamBuffer(),
	}
	ans.longsSize = owner.postingsWriter.SetField(fieldInfo)
	ans.longs = make([]int64, ans.longsSize)
	return ans
}

/* Writes the top count entries in pending, using lastTerm to compute the prefix. */
func (w *TermsWriter) writeBlocks(prefixLength, count int) (err error) {
	assert(count > 0)

	// The root block must write all remaining pending entries:
	assert(prefixLength > 0 || count == len(w.pending))

	lastSuffixLeadLabel := -1

	// True if we saw at least one term in this block (we record if a
	// block only points to sub-blocks in the terms index so we can
	// avoid seeking to it when we are looking for a term):
	hasTerms := false
	hasSubBlocks := false

	end := len(w.pending)
	start := end - count
	nextBlockStart := start
	nextFloorLeadLabel := -1

	for i, ent := range w.pending[start:] {
		// i+start is the absolute index of ent within w.pending
		var suffixLeadLabel int
		if ent.isTerm() {
			term := ent.(*PendingTerm)
			if len(term.term) == prefixLength {
				// suffix is 0, ie prefix 'foo' and term is 'foo' so the
				// term has empty string suffix in this block
				assert(lastSuffixLeadLabel == -1)
				suffixLeadLabel = -1
			} else {
				suffixLeadLabel = int(term.term[prefixLength])
			}
		} else {
			block := ent.(*PendingBlock)
			assert(len(block.prefix) > prefixLength)
			suffixLeadLabel = int(block.prefix[prefixLength])
		}

		if suffixLeadLabel != lastSuffixLeadLabel {
			if itemsInBlock := i + start - nextBlockStart; itemsInBlock >= w.owner.minItemsInBlock &&
				end-nextBlockStart > w.owner.maxItemsInBlock {
				// The count is too large for one block, so we must break
				// it into "floor" blocks, where we record the leading
				// label of the suffix of the first term in each floor
				// block, so at search time we can jump to the right floor
				// block. We just use a naive greedy segmenter here: make a
				// new floor block as soon as we have at least
				// minItemsInBlock. This is not always best: it often
				// produces a too-small block as the final block:
				isFloor := itemsInBlock < count
				var block *PendingBlock
				if block, err = w.writeBlock(prefixLength, isFloor,
					nextFloorLeadLabel, nextBlockStart, i+start, hasTerms,
					hasSubBlocks); err != nil {
					return
				}
				w.newBlocks = append(w.newBlocks, block)

				hasTerms = false
				hasSubBlocks = false
				nextFloorLeadLabel = suffixLeadLabel
				nextBlockStart = i + start
			}

			lastSuffixLeadLabel = suffixLeadLabel
		}

		if ent.isTerm() {
			hasTerms = true
		} else {
			hasSubBlocks = true
		}
	}

	// Write last block, if any:
	if nextBlockStart < end {
		itemsInBlock := end - nextBlockStart
		isFloor := itemsInBlock < count
		var block *PendingBlock
		if block, err = w.writeBlock(prefixLength, isFloor,
			nextFloorLeadLabel, nextBlockStart, end, hasTerms,
			hasSubBlocks); err != nil {
			return
		}
		w.newBlocks = append(w.newBlocks, block)
	}

	assert(len(w.newBlocks) > 0)

	firstBlock := w.newBlocks[0]

	assert(firstBlock.isFloor || len(w.newBlocks) == 1)

	if err = firstBlock.compileIndex(w.newBlocks,
		w.owner.scratchBytes, w.owner.scratchIntsRef); err != nil {
		return
	}

	// Remove slice from the top of the pending stack, that we just wrote:
	w.pending = w.pending[:start]

	// Append new block
	w.pending = append(w.pending, firstBlock)

	w.newBlocks = nil
	return nil
}
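
// Expository helper (not called anywhere in this package): the greedy
// floor-split test used in the loop above. A new floor block is started once
// at least minItemsInBlock entries have accumulated and more than
// maxItemsInBlock entries of the shared prefix still remain. With the
// suggested defaults (25/48) and, say, 120 entries sharing a prefix, this
// yields floor blocks of roughly 25 entries each until at most 48 remain,
// which become the final (possibly larger) floor block.
func shouldStartFloorBlockSketch(itemsInBlock, remaining, minItemsInBlock, maxItemsInBlock int) bool {
	return itemsInBlock >= minItemsInBlock && remaining > maxItemsInBlock
}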

/*
Writes the specified slice (start is inclusive, end is exclusive)
from pending stack as a new block. If isFloor is true, there were too
many (more than maxItemsInBlock) entries sharing the same prefix, and
so we broke it into multiple floor blocks where we record the
starting label of the suffix of each floor block.
*/
func (w *TermsWriter) writeBlock(
	prefixLength int,
	isFloor bool,
	floorLeadLabel, start, end int,
	hasTerms, hasSubBlocks bool) (*PendingBlock, error) {

	assert(end > start)

	startFP := w.owner.out.FilePointer()

	hasFloorLeadLabel := isFloor && floorLeadLabel != -1

	prefix := make([]byte, prefixLength)
	copy(prefix, w.lastTerm.Bytes()[:prefixLength])

	// write block header:
	numEntries := end - start
	code := numEntries << 1
	if end == len(w.pending) { // last block
		code |= 1
	}
	var err error
	if err = w.owner.out.WriteVInt(int32(code)); err != nil {
		return nil, err
	}

	// fmt.Printf(" writeBlock %vseg=%v len(pending)=%v prefixLength=%v "+
	// 	"indexPrefix=%v entCount=%v startFP=%v futureTermCount=%v%v "+
	// 	"isLastInFloor=%v\n",
	// 	map[bool]string{true: "(floor) "}[isFloor],
	// 	w.owner.segment,
	// 	len(w.pending),
	// 	prefixLength,
	// 	prefix,
	// 	length,
	// 	startFP,
	// 	futureTermCount,
	// 	map[bool]string{true: fmt.Sprintf(" floorLeadByte=%v", strconv.FormatInt(int64(floorLeadByte&0xff), 16))}[isFloor],
	// 	isLastInFloor,
	// )

	// 1st pass: pack term suffix bytes into []byte blob
	// TODO: cutover to bulk int codec... simple64?

	// We optimize the leaf block case (block has only terms), writing
	// a more compact format in this case:
	isLeafBlock := !hasSubBlocks

	var subIndices []*fst.FST

	var absolute = true

	if isLeafBlock { // only terms
		subIndices = nil
		for i, ent := range w.pending[start:end] {
			assert2(ent.isTerm(), "i=%v", i+start)

			term := ent.(*PendingTerm)
			assert2(strings.HasPrefix(string(term.term), string(prefix)), "term.term=%v prefix=%v", term.term, prefix)
			state := term.state
			suffix := len(term.term) - prefixLength
			// for leaf block we write suffix straight
			if err = w.suffixWriter.WriteVInt(int32(suffix)); err != nil {
				return nil, err
			}
			if err = w.suffixWriter.WriteBytes(term.term[prefixLength : prefixLength+suffix]); err != nil {
				return nil, err
			}
			assert(floorLeadLabel == -1 || int(term.term[prefixLength]) >= floorLeadLabel)

			// write term stats, to separate []byte blob:
			if err = w.statsWriter.WriteVInt(int32(state.DocFreq)); err != nil {
				return nil, err
			}
			if w.fieldInfo.IndexOptions() != INDEX_OPT_DOCS_ONLY {
				assert2(state.TotalTermFreq >= int64(state.DocFreq),
					"%v vs %v", state.TotalTermFreq, state.DocFreq)
				if err := w.statsWriter.WriteVLong(state.TotalTermFreq - int64(state.DocFreq)); err != nil {
					return nil, err
				}
			}

			// Write term meta data
			if err = w.owner.postingsWriter.EncodeTerm(w.longs, w.bytesWriter, w.fieldInfo, state, absolute); err != nil {
				return nil, err
			}
			for _, v := range w.longs[:w.longsSize] {
				assert(v >= 0)
				if err = w.metaWriter.WriteVLong(v); err != nil {
					return nil, err
				}
			}
			if err = w.bytesWriter.WriteTo(w.metaWriter); err != nil {
				return nil, err
			}
			w.bytesWriter.Reset()
			absolute = false
		}

	} else { // mixed terms and sub-blocks
		subIndices = nil
		for _, ent := range w.pending[start:end] {
			if ent.isTerm() {
				term := ent.(*PendingTerm)
				assert2(strings.HasPrefix(string(term.term), string(prefix)), "term.term=%v prefix=%v", term.term, prefix)
				state := term.state
				suffix := len(term.term) - prefixLength
				// for non-leaf block we borrow 1 bit to record
				// if entry is term or sub-block
				if err = w.suffixWriter.WriteVInt(int32(suffix << 1)); err != nil {
					return nil, err
				}
				if err = w.suffixWriter.WriteBytes(term.term[prefixLength : prefixLength+suffix]); err != nil {
					return nil, err
				}
				assert(floorLeadLabel == -1 || int(term.term[prefixLength]) >= floorLeadLabel)

				// write term stats, to separate []byte blob:
				if err = w.statsWriter.WriteVInt(int32(state.DocFreq)); err != nil {
					return nil, err
				}
				if w.fieldInfo.IndexOptions() != INDEX_OPT_DOCS_ONLY {
					assert(state.TotalTermFreq >= int64(state.DocFreq))
					if err = w.statsWriter.WriteVLong(state.TotalTermFreq - int64(state.DocFreq)); err != nil {
						return nil, err
					}
				}

				// write term meta data
				if err = w.owner.postingsWriter.EncodeTerm(w.longs, w.bytesWriter, w.fieldInfo, state, absolute); err != nil {
					return nil, err
				}
				for _, v := range w.longs[:w.longsSize] {
					assert(v >= 0)
					if err = w.metaWriter.WriteVLong(v); err != nil {
						return nil, err
					}
				}
				if err = w.bytesWriter.WriteTo(w.metaWriter); err != nil {
					return nil, err
				}
				w.bytesWriter.Reset()
				absolute = false

			} else {
				block := ent.(*PendingBlock)
				assert(strings.HasPrefix(string(block.prefix), string(prefix)))
				suffix := len(block.prefix) - prefixLength

				assert(suffix > 0)

				// for non-leaf block we borrow 1 bit to record if entry is
				// term or sub-block
				if err = w.suffixWriter.WriteVInt(int32((suffix << 1) | 1)); err != nil {
					return nil, err
				}
				if err = w.suffixWriter.WriteBytes(block.prefix[prefixLength : prefixLength+suffix]); err != nil {
					return nil, err
				}

				assert(floorLeadLabel == -1 || int(block.prefix[prefixLength]) >= floorLeadLabel)

				assert(block.fp < startFP)

				if err = w.suffixWriter.WriteVLong(startFP - block.fp); err != nil {
					return nil, err
				}
				subIndices = append(subIndices, block.index)
			}
		}

		assert(len(subIndices) != 0)
	}

	// TODO: we could block-write the term suffix pointer
	// this would take more space but would enable binary
	// search on lookup

	// write suffixes []byte blob to terms dict output:
	if err = w.owner.out.WriteVInt(
		int32(w.suffixWriter.FilePointer()<<1) |
			(map[bool]int32{true: 1, false: 0}[isLeafBlock])); err != nil {
		return nil, err
	}
	if err = w.suffixWriter.WriteTo(w.owner.out); err != nil {
		return nil, err
	}
	w.suffixWriter.Reset()

	// write term stats []byte blob
	if err = w.owner.out.WriteVInt(int32(w.statsWriter.FilePointer())); err != nil {
		return nil, err
	}
	if err = w.statsWriter.WriteTo(w.owner.out); err != nil {
		return nil, err
	}
	w.statsWriter.Reset()

	// Write term meta data []byte blob
	if err = w.owner.out.WriteVInt(int32(w.metaWriter.FilePointer())); err != nil {
		return nil, err
	}
	if err = w.metaWriter.WriteTo(w.owner.out); err != nil {
		return nil, err
	}
	w.metaWriter.Reset()

	if hasFloorLeadLabel {
		prefix = append(prefix, byte(floorLeadLabel))
	}

	return newPendingBlock(prefix, startFP, hasTerms, isFloor, floorLeadLabel, subIndices), nil
}
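
// Block layout summary (for reference, derived from the writes above): each
// block appended to the terms file consists of a vInt header packing the
// entry count with a low "last block of the pending slice" bit, followed by
// three length-prefixed blobs copied from the scratch buffers: the suffix
// bytes (whose length vInt also carries an isLeafBlock bit), the per-term
// stats (docFreq, plus the totalTermFreq-docFreq delta unless the field is
// DOCS_ONLY), and the per-term metadata produced by EncodeTerm. In non-leaf
// blocks each suffix length additionally carries a bit marking whether the
// entry is a term or a sub-block, and sub-block entries store the delta from
// this block's startFP back to the sub-block's fp.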

func (w *TermsWriter) Comparator() func(a, b []byte) bool {
	return util.UTF8SortedAsUnicodeLess
}

func (w *TermsWriter) StartTerm(text []byte) (codec.PostingsConsumer, error) {
	assert(w.owner != nil)
	assert(w.owner.postingsWriter != nil)
	err := w.owner.postingsWriter.StartTerm()
	return w.owner.postingsWriter, err
}

func (w *TermsWriter) FinishTerm(text []byte, stats *codec.TermStats) (err error) {
	assert(stats.DocFreq > 0)
	// fmt.Printf("BTTW.finishTerm term=%v:%v seg=%v df=%v\n",
	// 	w.fieldInfo.Name, utf8ToString(text), w.owner.segment, stats.DocFreq)

	assert2(w.fieldInfo.IndexOptions() == INDEX_OPT_DOCS_ONLY ||
		stats.TotalTermFreq >= int64(stats.DocFreq),
		"postingsWriter=%v", w.owner.postingsWriter)
	state := w.owner.postingsWriter.NewTermState()
	state.DocFreq = stats.DocFreq
	state.TotalTermFreq = stats.TotalTermFreq
	if err = w.owner.postingsWriter.FinishTerm(state); err != nil {
		return
	}

	w.sumDocFreq += int64(state.DocFreq)
	w.sumTotalTermFreq += state.TotalTermFreq
	if err = w.pushTerm(text); err != nil {
		return
	}

	term := newPendingTerm(text, state)
	w.pending = append(w.pending, term)
	w.numTerms++
	if w.firstPendingTerm == nil {
		w.firstPendingTerm = term
	}
	w.lastPendingTerm = term
	return nil
}

/* Pushes the new term to the top of the stack, and writes new blocks. */
func (w *TermsWriter) pushTerm(text []byte) error {
	limit := w.lastTerm.Length()
	if len(text) < limit {
		limit = len(text)
	}

	// Find common prefix between last term and current term:
	pos := 0
	for pos < limit && w.lastTerm.At(pos) == text[pos] {
		pos++
	}

	// Close the "abandoned" suffix now:
	for i := w.lastTerm.Length() - 1; i >= pos; i-- {
		// How many items on top of the stack share the current suffix
		// we are closing:
		if prefixTopSize := len(w.pending) - w.prefixStarts[i]; prefixTopSize >= w.owner.minItemsInBlock {
			if err := w.writeBlocks(i+1, prefixTopSize); err != nil {
				return err
			}
			w.prefixStarts[i] -= prefixTopSize - 1
		}
	}

	if len(w.prefixStarts) < len(text) {
		w.prefixStarts = util.GrowIntSlice(w.prefixStarts, len(text))
	}

	// Init new tail:
	for i := pos; i < len(text); i++ {
		w.prefixStarts[i] = len(w.pending)
	}

	w.lastTerm.Copy(text)
	return nil
}
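
// Expository helper (not called anywhere in this package): the common-prefix
// scan at the top of pushTerm, extracted as a pure function. For example,
// with lastTerm "abc" and incoming text "axe" it returns 1, so pushTerm then
// visits i=2 and i=1, i.e. the abandoned prefixes "abc" (length 3) and "ab"
// (length 2); for any such prefix owning at least minItemsInBlock pending
// entries, writeBlocks replaces those entries with a single PendingBlock.
func commonPrefixLenSketch(last, text []byte) int {
	limit := len(last)
	if len(text) < limit {
		limit = len(text)
	}
	pos := 0
	for pos < limit && last[pos] == text[pos] {
		pos++
	}
	return pos
}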

func (w *TermsWriter) Finish(sumTotalTermFreq, sumDocFreq int64, docCount int) (err error) {
	if w.numTerms > 0 {
		// Add empty term to force closing of all final blocks:
		if err = w.pushTerm(nil); err != nil {
			return err
		}

		// TODO: if len(pending) is already 1 with a non-zero prefix length
		// we can save writing a "degenerate" root block, but we have to
		// fix all the places that assume the root block's prefix is the empty string:
		if err = w.writeBlocks(0, len(w.pending)); err != nil {
			return err
		}

		// we better have one final "root" block:
		assert2(len(w.pending) == 1 && !w.pending[0].isTerm(),
			"len(pending) = %v pending=%v", len(w.pending), w.pending)
		root := w.pending[0].(*PendingBlock)
		assert2(len(root.prefix) == 0, "%v", root.prefix)
		assert(root.index.EmptyOutput() != nil)

		w.sumTotalTermFreq = sumTotalTermFreq
		w.sumDocFreq = sumDocFreq
		w.docCount = docCount

		// Write FST to index
		w.indexStartFP = w.owner.indexOut.FilePointer()
		err = root.index.Save(w.owner.indexOut)
		if err != nil {
			return err
		}
		// fmt.Printf(" write FST %v field=%v\n", w.indexStartFP, w.fieldInfo.Name)

		assert(w.firstPendingTerm != nil)
		minTerm := w.firstPendingTerm.term
		assert(w.lastPendingTerm != nil)
		maxTerm := w.lastPendingTerm.term

		w.owner.fields = append(w.owner.fields, newFieldMetaData(
			w.fieldInfo,
			w.pending[0].(*PendingBlock).index.EmptyOutput().([]byte),
			w.numTerms,
			w.indexStartFP,
			sumTotalTermFreq,
			sumDocFreq,
			docCount,
			w.longsSize,
			minTerm, maxTerm))
	} else {
		assert(sumTotalTermFreq == 0 || w.fieldInfo.IndexOptions() == INDEX_OPT_DOCS_ONLY && sumTotalTermFreq == -1)
		assert(sumDocFreq == 0)
		assert(docCount == 0)
	}
	return nil
}

func (w *BlockTreeTermsWriter) Close() (err error) {
	var success = false
	defer func() {
		if success {
			util.Close(w.out, w.indexOut, w.postingsWriter)
		} else {
			util.CloseWhileSuppressingError(w.out, w.indexOut, w.postingsWriter)
		}
	}()

	dirStart := w.out.FilePointer()
	indexDirStart := w.indexOut.FilePointer()

	if err = w.out.WriteVInt(int32(len(w.fields))); err != nil {
		return
	}

	for _, field := range w.fields {
		// fmt.Printf(" field %v %v terms\n", field.fieldInfo.Name, field.numTerms)
		if err = w.out.WriteVInt(field.fieldInfo.Number); err == nil {
			assert(field.numTerms > 0)
			if err = w.out.WriteVLong(field.numTerms); err == nil {
				if err = w.out.WriteVInt(int32(len(field.rootCode))); err == nil {
					err = w.out.WriteBytes(field.rootCode)
					if err == nil && field.fieldInfo.IndexOptions() != INDEX_OPT_DOCS_ONLY {
						err = w.out.WriteVLong(field.sumTotalTermFreq)
					}
					if err == nil {
						if err = w.out.WriteVLong(field.sumDocFreq); err == nil {
							if err = w.out.WriteVInt(int32(field.docCount)); err == nil {
								if err = w.out.WriteVInt(int32(field.longsSize)); err == nil {
									if err = w.indexOut.WriteVLong(field.indexStartFP); err == nil {
										if err = writeBytesRef(w.out, field.minTerm); err == nil {
											err = writeBytesRef(w.out, field.maxTerm)
										}
									}
								}
							}
						}
					}
				}
			}
		}
	}
	if err == nil {
		if err = w.writeTrailer(w.out, dirStart); err == nil {
			if err = codec.WriteFooter(w.out); err == nil {
				if err = w.writeIndexTrailer(w.indexOut, indexDirStart); err == nil {
					if err = codec.WriteFooter(w.indexOut); err == nil {
						success = true
					}
				}
			}
		}
	}
	return
}

func writeBytesRef(out store.IndexOutput, bytes []byte) (err error) {
	if err = out.WriteVInt(int32(len(bytes))); err == nil {
		err = out.WriteBytes(bytes)
	}
	return
}
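
// File layout summary (for reference, derived from the writes in this file):
//
//   terms file (TERMS_EXTENSION, "tim"): codec header written by WriteHeader,
//   the term blocks emitted by writeBlock, then the field summary written by
//   Close (field count, and per field: field number, numTerms, rootCode,
//   sumTotalTermFreq unless the field is DOCS_ONLY, sumDocFreq, docCount,
//   longsSize, minTerm, maxTerm via writeBytesRef), a long pointing back to
//   the start of that summary (writeTrailer with dirStart), and the codec
//   footer.
//
//   terms index file (TERMS_INDEX_EXTENSION, "tip"): codec header written by
//   WriteIndexHeader, each field's FST index saved by TermsWriter.Finish at
//   its indexStartFP, then one vLong per field (written by Close) recording
//   that indexStartFP, the index trailer long (writeIndexTrailer), and the
//   codec footer.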