github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/blocktree/termsWriter.go

package blocktree

import (
	"fmt"
	"github.com/balzaczyy/golucene/core/codec"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	. "github.com/balzaczyy/golucene/core/index/model"
	"github.com/balzaczyy/golucene/core/store"
	"github.com/balzaczyy/golucene/core/util"
	"github.com/balzaczyy/golucene/core/util/fst"
	"github.com/balzaczyy/golucene/core/util/packed"
	"io"
	"math"
	"strings"
)

// codec/PostingsWriterBase.java

/*
Extension of PostingsConsumer to support pluggable term dictionaries.

This class contains additional hooks to interact with the provided
term dictionaries such as BlockTreeTermsWriter. If you want to re-use
an existing implementation and are only interested in customizing the
format of the postings list, extend this class instead.
*/
type PostingsWriterBase interface {
	codec.PostingsConsumer
	io.Closer

	// Called once after startup, before any terms have been added.
	// Implementations typically write a header to the provided termsOut.
	Init(store.IndexOutput) error
	NewTermState() *BlockTermState
	// Start a new term. Note that a matching call to finishTerm() is
	// made only if the term has at least one document.
	StartTerm() error
	// Finishes the current term. The provided TermStats contains the
	// term's summary statistics.
	FinishTerm(*BlockTermState) error
	EncodeTerm([]int64, util.DataOutput, *FieldInfo, *BlockTermState, bool) error
	// Called when writing switches to another field. Returns the number
	// of int64 metadata values (longsSize) that EncodeTerm fills per term
	// for this field.
	SetField(fieldInfo *FieldInfo) int
}
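
// The sketch below is added for exposition only (nothing in this package
// calls it; names ending in "Sketch" are illustrative). It shows the order
// in which this file drives a PostingsWriterBase for one field and one term:
// Init is called once from NewBlockTreeTermsWriter, SetField from
// newTermsWriter, StartTerm/FinishTerm from TermsWriter.StartTerm and
// TermsWriter.FinishTerm, and EncodeTerm later from writeBlock when the
// term's block is flushed.
func drivePostingsWriterSketch(pw PostingsWriterBase, termsOut store.IndexOutput,
	field *FieldInfo, stats *codec.TermStats) error {
	if err := pw.Init(termsOut); err != nil { // postings format writes its header
		return err
	}
	longsSize := pw.SetField(field) // per-term metadata width for this field
	if err := pw.StartTerm(); err != nil {
		return err
	}
	// ... postings for the term are added through the embedded PostingsConsumer ...
	state := pw.NewTermState()
	state.DocFreq = stats.DocFreq
	state.TotalTermFreq = stats.TotalTermFreq
	if err := pw.FinishTerm(state); err != nil {
		return err
	}
	_ = longsSize // writeBlock passes a []int64 of this length to EncodeTerm
	return nil
}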

// codec/BlockTreeTermsWriter.java
const (
	/* Suggested default value for the minItemsInBlock parameter. */
	DEFAULT_MIN_BLOCK_SIZE = 25

	/* Suggested default value for the maxItemsInBlock parameter. */
	DEFAULT_MAX_BLOCK_SIZE = 48

	/* Extension of terms file */
	TERMS_EXTENSION  = "tim"
	TERMS_CODEC_NAME = "BLOCK_TREE_TERMS_DICT"

	TERMS_VERSION_START = 0
	/* Append-only */
	TERMS_VERSION_APPEND_ONLY   = 1
	TERMS_VERSION_META_ARRAY    = 2
	TERMS_VERSION_CHECKSUM      = 3
	TERMS_VERSION_MIN_MAX_TERMS = 4
	/* Current terms format. */
	TERMS_VERSION_CURRENT = TERMS_VERSION_MIN_MAX_TERMS

	/* Extension of terms index file */
	TERMS_INDEX_EXTENSION  = "tip"
	TERMS_INDEX_CODEC_NAME = "BLOCK_TREE_TERMS_INDEX"
)

type BlockTreeTermsWriterSPI interface {
	WriteHeader(store.IndexOutput) error
	WriteIndexHeader(store.IndexOutput) error
}

type FieldMetaData struct {
	fieldInfo        *FieldInfo
	rootCode         []byte
	numTerms         int64
	indexStartFP     int64
	sumTotalTermFreq int64
	sumDocFreq       int64
	docCount         int
	longsSize        int
	minTerm          []byte
	maxTerm          []byte
}

func newFieldMetaData(fieldInfo *FieldInfo,
	rootCode []byte, numTerms, indexStartFP, sumTotalTermFreq, sumDocFreq int64,
	docCount, longsSize int, minTerm, maxTerm []byte) *FieldMetaData {
	assert(numTerms > 0)
	assert2(rootCode != nil, "field=%v numTerms=%v", fieldInfo.Name, numTerms)
	return &FieldMetaData{
		fieldInfo,
		rootCode,
		numTerms,
		indexStartFP,
		sumTotalTermFreq,
		sumDocFreq,
		docCount,
		longsSize,
		minTerm,
		maxTerm,
	}
}

type BlockTreeTermsWriter struct {
	spi BlockTreeTermsWriterSPI

	out             store.IndexOutput
	indexOut        store.IndexOutput
	maxDoc          int
	minItemsInBlock int
	maxItemsInBlock int

	postingsWriter PostingsWriterBase
	fieldInfos     FieldInfos
	currentField   *FieldInfo

	fields  []*FieldMetaData
	segment string

	scratchBytes   *store.RAMOutputStream
	scratchIntsRef *util.IntsRefBuilder
}

/*
Create a new writer. The number of items (terms or sub-blocks) per
block will aim to be between minItemsInBlock and maxItemsInBlock,
though in some cases the blocks may be smaller than the min.
*/
func NewBlockTreeTermsWriter(state *SegmentWriteState,
	postingsWriter PostingsWriterBase,
	minItemsInBlock, maxItemsInBlock int) (*BlockTreeTermsWriter, error) {
	assert2(minItemsInBlock >= 2, "minItemsInBlock must be >= 2; got %v", minItemsInBlock)
	assert2(maxItemsInBlock >= 1, "maxItemsInBlock must be >= 1; got %v", maxItemsInBlock)
	assert2(minItemsInBlock <= maxItemsInBlock,
		"maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=%v minItemsInBlock=%v",
		maxItemsInBlock, minItemsInBlock)
	assert2(2*(minItemsInBlock-1) <= maxItemsInBlock,
		"maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=%v minItemsInBlock=%v",
		maxItemsInBlock, minItemsInBlock)

	ans := &BlockTreeTermsWriter{
		maxDoc:          state.SegmentInfo.DocCount(),
		fieldInfos:      state.FieldInfos,
		minItemsInBlock: minItemsInBlock,
		maxItemsInBlock: maxItemsInBlock,
		postingsWriter:  postingsWriter,
		segment:         state.SegmentInfo.Name,
		scratchBytes:    store.NewRAMOutputStreamBuffer(),
		scratchIntsRef:  util.NewIntsRefBuilder(),
		// bytesWriter: store.NewRAMOutputStreamBuffer(),
		// bytesWriter2: store.NewRAMOutputStreamBuffer(),
	}
	ans.spi = ans
	var out, indexOut store.IndexOutput
	if err := func() error {
		var success = false
		defer func() {
			if !success {
				util.CloseWhileSuppressingError(out, indexOut)
			}
		}()

		var err error
		termsFileName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, TERMS_EXTENSION)
		if out, err = state.Directory.CreateOutput(termsFileName, state.Context); err != nil {
			return err
		}
		if err = ans.spi.WriteHeader(out); err != nil {
			return err
		}

		termsIndexFileName := util.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, TERMS_INDEX_EXTENSION)
		if indexOut, err = state.Directory.CreateOutput(termsIndexFileName, state.Context); err != nil {
			return err
		}
		if err = ans.spi.WriteIndexHeader(indexOut); err != nil {
			return err
		}

		// have consumer write its format/header
		if err = postingsWriter.Init(out); err != nil {
			return err
		}
		success = true
		return nil
	}(); err != nil {
		return nil, err
	}
	ans.out = out
	ans.indexOut = indexOut
	return ans, nil
}

func (w *BlockTreeTermsWriter) WriteHeader(out store.IndexOutput) error {
	return codec.WriteHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT)
}

func (w *BlockTreeTermsWriter) WriteIndexHeader(out store.IndexOutput) error {
	return codec.WriteHeader(out, TERMS_INDEX_CODEC_NAME, TERMS_VERSION_CURRENT)
}

/* Writes the terms file trailer. */
func (w *BlockTreeTermsWriter) writeTrailer(out store.IndexOutput, dirStart int64) error {
	return out.WriteLong(dirStart)
}

/* Writes the index file trailer. */
func (w *BlockTreeTermsWriter) writeIndexTrailer(indexOut store.IndexOutput, dirStart int64) error {
	return indexOut.WriteLong(dirStart)
}

func (w *BlockTreeTermsWriter) AddField(field *FieldInfo) (TermsConsumer, error) {
	assert(w.currentField == nil || w.currentField.Name < field.Name)
	w.currentField = field
	return newTermsWriter(w, field), nil
}

func encodeOutput(fp int64, hasTerms bool, isFloor bool) int64 {
	assert(fp < (1 << 62))
	ans := (fp << 2)
	if hasTerms {
		ans |= BTT_OUTPUT_FLAG_HAS_TERMS
	}
	if isFloor {
		ans |= BTT_OUTPUT_FLAG_IS_FLOOR
	}
	return ans
}
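
// Illustrative inverse of encodeOutput, added for exposition only (nothing in
// this package calls it). It assumes BTT_OUTPUT_FLAG_HAS_TERMS and
// BTT_OUTPUT_FLAG_IS_FLOOR occupy the two bits below the shifted file pointer,
// which is what encodeOutput's fp<<2 and its fp < 1<<62 assertion imply.
func decodeOutputSketch(code int64) (fp int64, hasTerms, isFloor bool) {
	fp = code >> 2 // the upper 62 bits carry the block's file pointer
	hasTerms = code&BTT_OUTPUT_FLAG_HAS_TERMS != 0
	isFloor = code&BTT_OUTPUT_FLAG_IS_FLOOR != 0
	return fp, hasTerms, isFloor
}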

type PendingEntry interface {
	isTerm() bool
}

type PendingTerm struct {
	term []byte
	// stats + metadata
	state *BlockTermState
}

func newPendingTerm(term []byte, state *BlockTermState) *PendingTerm {
	clone := make([]byte, len(term))
	copy(clone, term)
	return &PendingTerm{clone, state}
}

func (t *PendingTerm) isTerm() bool { return true }

func (t *PendingTerm) String() string { panic("not implemented yet") }

type PendingBlock struct {
	prefix        []byte
	fp            int64
	index         *fst.FST
	subIndices    []*fst.FST
	hasTerms      bool
	isFloor       bool
	floorLeadByte int
}

func newPendingBlock(prefix []byte, fp int64, hasTerms, isFloor bool,
	floorLeadByte int, subIndices []*fst.FST) *PendingBlock {
	return &PendingBlock{
		prefix:        prefix,
		fp:            fp,
		index:         nil,
		subIndices:    subIndices,
		hasTerms:      hasTerms,
		isFloor:       isFloor,
		floorLeadByte: floorLeadByte,
	}
}

func (b *PendingBlock) isTerm() bool { return false }

func (b *PendingBlock) String() string {
	return fmt.Sprintf("BLOCK: %v", utf8ToString(b.prefix))
}

func (b *PendingBlock) compileIndex(blocks []*PendingBlock,
	scratchBytes *store.RAMOutputStream,
	scratchIntsRef *util.IntsRefBuilder) (err error) {

	assert2(b.isFloor && len(blocks) > 1 || (!b.isFloor && len(blocks) == 1),
		"isFloor=%v blocks=%v", b.isFloor, blocks)
	assert(blocks[0] == b)

	assert(scratchBytes.FilePointer() == 0)

	// TODO: try writing the leading vLong in MSB order
	// (opposite of what Lucene does today), for better
	// outputs sharing in the FST
	if err = scratchBytes.WriteVLong(encodeOutput(b.fp, b.hasTerms, b.isFloor)); err != nil {
		return
	}
	if b.isFloor {
		if err = scratchBytes.WriteVInt(int32(len(blocks) - 1)); err != nil {
			return
		}
		for _, sub := range blocks[1:] {
			assert(sub.floorLeadByte != -1)
			// fmt.Printf(" write floorLeadByte=%v\n", util.ItoHex(int64(sub.floorLeadByte)))
			if err = scratchBytes.WriteByte(byte(sub.floorLeadByte)); err != nil {
				return
			}
			assert(sub.fp > b.fp)
			if err = scratchBytes.WriteVLong((sub.fp-b.fp)<<1 |
				int64(map[bool]int{true: 1, false: 0}[sub.hasTerms])); err != nil {
				return
			}
		}
	}

	outputs := fst.ByteSequenceOutputsSingleton()
	indexBuilder := fst.NewBuilder(fst.INPUT_TYPE_BYTE1,
		0, 0, true, false, int(math.MaxInt32),
		outputs, false,
		packed.PackedInts.COMPACT, true, 15)

	// fmt.Printf(" compile index for prefix=%v\n", b.prefix)

	bytes := make([]byte, scratchBytes.FilePointer())
	assert(len(bytes) > 0)
	err = scratchBytes.WriteToBytes(bytes)
	if err != nil {
		return err
	}
	err = indexBuilder.Add(fst.ToIntsRef(b.prefix, scratchIntsRef), bytes)
	if err != nil {
		return err
	}
	scratchBytes.Reset()

	// copy over index for all sub-blocks
	for _, block := range blocks {
		if block.subIndices != nil {
			for _, subIndex := range block.subIndices {
				if err = b.append(indexBuilder, subIndex, scratchIntsRef); err != nil {
					return err
				}
			}
		}
		block.subIndices = nil
	}

	if b.index, err = indexBuilder.Finish(); err != nil {
		return err
	}
	assert(b.subIndices == nil)
	return nil
}
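
// The helper below is an expository sketch of the "root code" layout that
// compileIndex builds in scratchBytes (it is not called anywhere in this
// package): first the encodeOutput vLong for the block itself, then, for a
// floor block, the number of trailing floor sub-blocks followed by each
// sub-block's lead byte and a vLong packing its file-pointer delta with a
// hasTerms bit.
func buildFloorRootCodeSketch(b *PendingBlock, subs []*PendingBlock) ([]byte, error) {
	scratch := store.NewRAMOutputStreamBuffer()
	if err := scratch.WriteVLong(encodeOutput(b.fp, b.hasTerms, b.isFloor)); err != nil {
		return nil, err
	}
	if b.isFloor {
		if err := scratch.WriteVInt(int32(len(subs))); err != nil {
			return nil, err
		}
		for _, sub := range subs { // corresponds to blocks[1:] in compileIndex
			if err := scratch.WriteByte(byte(sub.floorLeadByte)); err != nil {
				return nil, err
			}
			hasTermsBit := int64(0)
			if sub.hasTerms {
				hasTermsBit = 1
			}
			if err := scratch.WriteVLong((sub.fp-b.fp)<<1 | hasTermsBit); err != nil {
				return nil, err
			}
		}
	}
	rootCode := make([]byte, scratch.FilePointer())
	if err := scratch.WriteToBytes(rootCode); err != nil {
		return nil, err
	}
	return rootCode, nil
}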

func (b *PendingBlock) append(
	builder *fst.Builder,
	subIndex *fst.FST,
	scratchIntsRef *util.IntsRefBuilder) error {

	subIndexEnum := fst.NewBytesRefFSTEnum(subIndex)
	indexEnt, err := subIndexEnum.Next()
	for err == nil && indexEnt != nil {
		// fmt.Printf(" add sub=%v output=%v\n", indexEnt.Input, indexEnt.Output)
		err = builder.Add(fst.ToIntsRef(indexEnt.Input.ToBytes(), scratchIntsRef), indexEnt.Output)
		if err == nil {
			indexEnt, err = subIndexEnum.Next()
		}
	}
	return err
}

type TermsWriter struct {
	owner            *BlockTreeTermsWriter
	fieldInfo        *FieldInfo
	longsSize        int
	numTerms         int64
	docsSeen         *util.FixedBitSet
	sumTotalTermFreq int64
	sumDocFreq       int64
	docCount         int
	indexStartFP     int64

	// Records index into pending where the current prefix at that
	// length "started"; for example, if current term starts with 't',
	// prefixStarts[0] is the index into pending for the first
	// term/sub-block starting with 't'. We use this to figure out when
	// to write a new block:
	lastTerm     *util.BytesRefBuilder
	prefixStarts []int

	longs []int64

	// Pending stack of terms and blocks. As terms arrive (in sorted
	// order) we append to this stack, and once the top of the stack has
	// enough terms starting with a common prefix, we write a new block
	// with those terms and replace those terms in the stack with a new
	// block:
	pending []PendingEntry

	// Reused in writeBlocks:
	newBlocks []*PendingBlock

	firstPendingTerm *PendingTerm
	lastPendingTerm  *PendingTerm

	suffixWriter *store.RAMOutputStream
	statsWriter  *store.RAMOutputStream
	metaWriter   *store.RAMOutputStream
	bytesWriter  *store.RAMOutputStream
}

func newTermsWriter(owner *BlockTreeTermsWriter,
	fieldInfo *FieldInfo) *TermsWriter {
	ans := &TermsWriter{
		owner:        owner,
		fieldInfo:    fieldInfo,
		lastTerm:     util.NewBytesRefBuilder(),
		prefixStarts: make([]int, 8),
		suffixWriter: store.NewRAMOutputStreamBuffer(),
		statsWriter:  store.NewRAMOutputStreamBuffer(),
		metaWriter:   store.NewRAMOutputStreamBuffer(),
		bytesWriter:  store.NewRAMOutputStreamBuffer(),
	}
	ans.longsSize = owner.postingsWriter.SetField(fieldInfo)
	ans.longs = make([]int64, ans.longsSize)
	return ans
}

/* Writes the top count entries in pending, using lastTerm to compute the prefix. */
func (w *TermsWriter) writeBlocks(prefixLength, count int) (err error) {
	assert(count > 0)

	// The root block must write all remaining pending entries:
	assert(prefixLength > 0 || count == len(w.pending))

	lastSuffixLeadLabel := -1

	// True if we saw at least one term in this block (we record if a
	// block only points to sub-blocks in the terms index so we can
	// avoid seeking to it when we are looking for a term):
	hasTerms := false
	hasSubBlocks := false

	end := len(w.pending)
	start := end - count
	nextBlockStart := start
	nextFloorLeadLabel := -1

	for i, ent := range w.pending[start:] {
		// i+start is the absolute index of ent within w.pending
		var suffixLeadLabel int
		if ent.isTerm() {
			term := ent.(*PendingTerm)
			if len(term.term) == prefixLength {
				// suffix is 0, ie prefix 'foo' and term is 'foo' so the
				// term has empty string suffix in this block
				assert(lastSuffixLeadLabel == -1)
				suffixLeadLabel = -1
			} else {
				suffixLeadLabel = int(term.term[prefixLength])
			}
		} else {
			block := ent.(*PendingBlock)
			assert(len(block.prefix) > prefixLength)
			suffixLeadLabel = int(block.prefix[prefixLength])
		}

		if suffixLeadLabel != lastSuffixLeadLabel {
			if itemsInBlock := i + start - nextBlockStart; itemsInBlock >= w.owner.minItemsInBlock &&
				end-nextBlockStart > w.owner.maxItemsInBlock {
				// The count is too large for one block, so we must break
				// it into "floor" blocks, where we record the leading
				// label of the suffix of the first term in each floor
				// block, so at search time we can jump to the right floor
				// block. We just use a naive greedy segmenter here: make a
				// new floor block as soon as we have at least
				// minItemsInBlock. This is not always best: it often
				// produces a too-small block as the final block:
				isFloor := itemsInBlock < count
				var block *PendingBlock
				if block, err = w.writeBlock(prefixLength, isFloor,
					nextFloorLeadLabel, nextBlockStart, i+start, hasTerms,
					hasSubBlocks); err != nil {
					return
				}
				w.newBlocks = append(w.newBlocks, block)

				hasTerms = false
				hasSubBlocks = false
				nextFloorLeadLabel = suffixLeadLabel
				nextBlockStart = i + start
			}

			lastSuffixLeadLabel = suffixLeadLabel
		}

		if ent.isTerm() {
			hasTerms = true
		} else {
			hasSubBlocks = true
		}
	}

	// Write last block, if any:
	if nextBlockStart < end {
		itemsInBlock := end - nextBlockStart
		isFloor := itemsInBlock < count
		var block *PendingBlock
		if block, err = w.writeBlock(prefixLength, isFloor,
			nextFloorLeadLabel, nextBlockStart, end, hasTerms,
			hasSubBlocks); err != nil {
			return
		}
		w.newBlocks = append(w.newBlocks, block)
	}

	assert(len(w.newBlocks) > 0)

	firstBlock := w.newBlocks[0]

	assert(firstBlock.isFloor || len(w.newBlocks) == 1)

	if err = firstBlock.compileIndex(w.newBlocks,
		w.owner.scratchBytes, w.owner.scratchIntsRef); err != nil {
		return
	}

	// Remove slice from the top of the pending stack, that we just wrote:
	w.pending = w.pending[:start]

	// Append new block
	w.pending = append(w.pending, firstBlock)

	w.newBlocks = nil
	return nil
}
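
// Expository helper (not called anywhere in this package): the greedy
// floor-split test used in the loop above. A new floor block is started once
// at least minItemsInBlock entries have accumulated and more than
// maxItemsInBlock entries of the shared prefix still remain. With the
// suggested defaults (25/48) and, say, 120 entries sharing a prefix, this
// yields floor blocks of roughly 25 entries each until at most 48 remain,
// which become the final (possibly larger) floor block.
func shouldStartFloorBlockSketch(itemsInBlock, remaining, minItemsInBlock, maxItemsInBlock int) bool {
	return itemsInBlock >= minItemsInBlock && remaining > maxItemsInBlock
}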

/*
Writes the specified slice (start is inclusive, end is exclusive)
from pending stack as a new block. If isFloor is true, there were too
many (more than maxItemsInBlock) entries sharing the same prefix, and
so we broke it into multiple floor blocks where we record the
starting label of the suffix of each floor block.
*/
func (w *TermsWriter) writeBlock(
	prefixLength int,
	isFloor bool,
	floorLeadLabel, start, end int,
	hasTerms, hasSubBlocks bool) (*PendingBlock, error) {

	assert(end > start)

	startFP := w.owner.out.FilePointer()

	hasFloorLeadLabel := isFloor && floorLeadLabel != -1

	prefix := make([]byte, prefixLength)
	copy(prefix, w.lastTerm.Bytes()[:prefixLength])

	// write block header:
	numEntries := end - start
	code := numEntries << 1
	if end == len(w.pending) { // last block
		code |= 1
	}
	var err error
	if err = w.owner.out.WriteVInt(int32(code)); err != nil {
		return nil, err
	}

	// fmt.Printf(" writeBlock %vseg=%v len(pending)=%v prefixLength=%v "+
	// 	"indexPrefix=%v entCount=%v startFP=%v futureTermCount=%v%v "+
	// 	"isLastInFloor=%v\n",
	// 	map[bool]string{true: "(floor) "}[isFloor],
	// 	w.owner.segment,
	// 	len(w.pending),
	// 	prefixLength,
	// 	prefix,
	// 	length,
	// 	startFP,
	// 	futureTermCount,
	// 	map[bool]string{true: fmt.Sprintf(" floorLeadByte=%v", strconv.FormatInt(int64(floorLeadByte&0xff), 16))}[isFloor],
	// 	isLastInFloor,
	// )

	// 1st pass: pack term suffix bytes into []byte blob
	// TODO: cutover to bulk int codec... simple64?

	// We optimize the leaf block case (block has only terms), writing
	// a more compact format in this case:
	isLeafBlock := !hasSubBlocks

	var subIndices []*fst.FST

	var absolute = true

	if isLeafBlock { // only terms
		subIndices = nil
		for i, ent := range w.pending[start:end] {
			assert2(ent.isTerm(), "i=%v", i+start)

			term := ent.(*PendingTerm)
			assert2(strings.HasPrefix(string(term.term), string(prefix)), "term.term=%v prefix=%v", term.term, prefix)
			state := term.state
			suffix := len(term.term) - prefixLength
			// for leaf block we write suffix straight
			if err = w.suffixWriter.WriteVInt(int32(suffix)); err != nil {
				return nil, err
			}
			if err = w.suffixWriter.WriteBytes(term.term[prefixLength : prefixLength+suffix]); err != nil {
				return nil, err
			}
			assert(floorLeadLabel == -1 || int(term.term[prefixLength]) >= floorLeadLabel)

			// write term stats, to separate []byte blob:
			if err = w.statsWriter.WriteVInt(int32(state.DocFreq)); err != nil {
				return nil, err
			}
			if w.fieldInfo.IndexOptions() != INDEX_OPT_DOCS_ONLY {
				assert2(state.TotalTermFreq >= int64(state.DocFreq),
					"%v vs %v", state.TotalTermFreq, state.DocFreq)
				if err := w.statsWriter.WriteVLong(state.TotalTermFreq - int64(state.DocFreq)); err != nil {
					return nil, err
				}
			}

			// Write term meta data
			if err = w.owner.postingsWriter.EncodeTerm(w.longs, w.bytesWriter, w.fieldInfo, state, absolute); err != nil {
				return nil, err
			}
			for _, v := range w.longs[:w.longsSize] {
				assert(v >= 0)
				if err = w.metaWriter.WriteVLong(v); err != nil {
					return nil, err
				}
			}
			if err = w.bytesWriter.WriteTo(w.metaWriter); err != nil {
				return nil, err
			}
			w.bytesWriter.Reset()
			absolute = false
		}

	} else { // mixed terms and sub-blocks
		subIndices = nil
		for _, ent := range w.pending[start:end] {
			if ent.isTerm() {
				term := ent.(*PendingTerm)
				assert2(strings.HasPrefix(string(term.term), string(prefix)), "term.term=%v prefix=%v", term.term, prefix)
				state := term.state
				suffix := len(term.term) - prefixLength
				// for non-leaf block we borrow 1 bit to record
				// if entry is term or sub-block
				if err = w.suffixWriter.WriteVInt(int32(suffix << 1)); err != nil {
					return nil, err
				}
				if err = w.suffixWriter.WriteBytes(term.term[prefixLength : prefixLength+suffix]); err != nil {
					return nil, err
				}
				assert(floorLeadLabel == -1 || int(term.term[prefixLength]) >= floorLeadLabel)

				// write term stats, to separate []byte blob:
				if err = w.statsWriter.WriteVInt(int32(state.DocFreq)); err != nil {
					return nil, err
				}
				if w.fieldInfo.IndexOptions() != INDEX_OPT_DOCS_ONLY {
					assert(state.TotalTermFreq >= int64(state.DocFreq))
					if err = w.statsWriter.WriteVLong(state.TotalTermFreq - int64(state.DocFreq)); err != nil {
						return nil, err
					}
				}

				// write term meta data
				if err = w.owner.postingsWriter.EncodeTerm(w.longs, w.bytesWriter, w.fieldInfo, state, absolute); err != nil {
					return nil, err
				}
				for _, v := range w.longs[:w.longsSize] {
					assert(v >= 0)
					if err = w.metaWriter.WriteVLong(v); err != nil {
						return nil, err
					}
				}
				if err = w.bytesWriter.WriteTo(w.metaWriter); err != nil {
					return nil, err
				}
				w.bytesWriter.Reset()
				absolute = false

			} else {
				block := ent.(*PendingBlock)
				assert(strings.HasPrefix(string(block.prefix), string(prefix)))
				suffix := len(block.prefix) - prefixLength

				assert(suffix > 0)

				// for non-leaf block we borrow 1 bit to record if entry is
				// term or sub-block
				if err = w.suffixWriter.WriteVInt(int32((suffix << 1) | 1)); err != nil {
					return nil, err
				}
				if err = w.suffixWriter.WriteBytes(block.prefix[prefixLength : prefixLength+suffix]); err != nil {
					return nil, err
				}

				assert(floorLeadLabel == -1 || int(block.prefix[prefixLength]) >= floorLeadLabel)

				assert(block.fp < startFP)

				if err = w.suffixWriter.WriteVLong(startFP - block.fp); err != nil {
					return nil, err
				}
				subIndices = append(subIndices, block.index)
			}
		}

		assert(len(subIndices) != 0)
	}

	// TODO: we could block-write the term suffix pointer
	// this would take more space but would enable binary
	// search on lookup

	// write suffixes []byte blob to terms dict output:
	if err = w.owner.out.WriteVInt(
		int32(w.suffixWriter.FilePointer()<<1) |
			(map[bool]int32{true: 1, false: 0}[isLeafBlock])); err != nil {
		return nil, err
	}
	if err = w.suffixWriter.WriteTo(w.owner.out); err != nil {
		return nil, err
	}
	w.suffixWriter.Reset()

	// write term stats []byte blob
	if err = w.owner.out.WriteVInt(int32(w.statsWriter.FilePointer())); err != nil {
		return nil, err
	}
	if err = w.statsWriter.WriteTo(w.owner.out); err != nil {
		return nil, err
	}
	w.statsWriter.Reset()

	// Write term meta data []byte blob
	if err = w.owner.out.WriteVInt(int32(w.metaWriter.FilePointer())); err != nil {
		return nil, err
	}
	if err = w.metaWriter.WriteTo(w.owner.out); err != nil {
		return nil, err
	}
	w.metaWriter.Reset()

	if hasFloorLeadLabel {
		prefix = append(prefix, byte(floorLeadLabel))
	}

	return newPendingBlock(prefix, startFP, hasTerms, isFloor, floorLeadLabel, subIndices), nil
}
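
// Block layout summary (for reference, derived from the writes above): each
// block appended to the terms file consists of a vInt header packing the
// entry count with a low "last block of the pending slice" bit, followed by
// three length-prefixed blobs copied from the scratch buffers: the suffix
// bytes (whose length vInt also carries an isLeafBlock bit), the per-term
// stats (docFreq, plus the totalTermFreq-docFreq delta unless the field is
// DOCS_ONLY), and the per-term metadata produced by EncodeTerm. In non-leaf
// blocks each suffix length additionally carries a bit marking whether the
// entry is a term or a sub-block, and sub-block entries store the delta from
// this block's startFP back to the sub-block's fp.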

func (w *TermsWriter) Comparator() func(a, b []byte) bool {
	return util.UTF8SortedAsUnicodeLess
}

func (w *TermsWriter) StartTerm(text []byte) (codec.PostingsConsumer, error) {
	assert(w.owner != nil)
	assert(w.owner.postingsWriter != nil)
	err := w.owner.postingsWriter.StartTerm()
	return w.owner.postingsWriter, err
}

func (w *TermsWriter) FinishTerm(text []byte, stats *codec.TermStats) (err error) {
	assert(stats.DocFreq > 0)
	// fmt.Printf("BTTW.finishTerm term=%v:%v seg=%v df=%v\n",
	// 	w.fieldInfo.Name, utf8ToString(text), w.owner.segment, stats.DocFreq)

	assert2(w.fieldInfo.IndexOptions() == INDEX_OPT_DOCS_ONLY ||
		stats.TotalTermFreq >= int64(stats.DocFreq),
		"postingsWriter=%v", w.owner.postingsWriter)
	state := w.owner.postingsWriter.NewTermState()
	state.DocFreq = stats.DocFreq
	state.TotalTermFreq = stats.TotalTermFreq
	if err = w.owner.postingsWriter.FinishTerm(state); err != nil {
		return
	}

	w.sumDocFreq += int64(state.DocFreq)
	w.sumTotalTermFreq += state.TotalTermFreq
	if err = w.pushTerm(text); err != nil {
		return
	}

	term := newPendingTerm(text, state)
	w.pending = append(w.pending, term)
	w.numTerms++
	if w.firstPendingTerm == nil {
		w.firstPendingTerm = term
	}
	w.lastPendingTerm = term
	return nil
}

/* Pushes the new term to the top of the stack, and writes new blocks. */
func (w *TermsWriter) pushTerm(text []byte) error {
	limit := w.lastTerm.Length()
	if len(text) < limit {
		limit = len(text)
	}

	// Find common prefix between last term and current term:
	pos := 0
	for pos < limit && w.lastTerm.At(pos) == text[pos] {
		pos++
	}

	// Close the "abandoned" suffix now:
	for i := w.lastTerm.Length() - 1; i >= pos; i-- {
		// How many items on top of the stack share the current suffix
		// we are closing:
		if prefixTopSize := len(w.pending) - w.prefixStarts[i]; prefixTopSize >= w.owner.minItemsInBlock {
			if err := w.writeBlocks(i+1, prefixTopSize); err != nil {
				return err
			}
			w.prefixStarts[i] -= prefixTopSize - 1
		}
	}

	if len(w.prefixStarts) < len(text) {
		w.prefixStarts = util.GrowIntSlice(w.prefixStarts, len(text))
	}

	// Init new tail:
	for i := pos; i < len(text); i++ {
		w.prefixStarts[i] = len(w.pending)
	}

	w.lastTerm.Copy(text)
	return nil
}
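
// Expository helper (not called anywhere in this package): the common-prefix
// scan at the top of pushTerm, extracted as a pure function. For example,
// with lastTerm "abc" and incoming text "axe" it returns 1, so pushTerm then
// visits i=2 and i=1, i.e. the abandoned prefixes "abc" (length 3) and "ab"
// (length 2); for any such prefix owning at least minItemsInBlock pending
// entries, writeBlocks replaces those entries with a single PendingBlock.
func commonPrefixLenSketch(last, text []byte) int {
	limit := len(last)
	if len(text) < limit {
		limit = len(text)
	}
	pos := 0
	for pos < limit && last[pos] == text[pos] {
		pos++
	}
	return pos
}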

func (w *TermsWriter) Finish(sumTotalTermFreq, sumDocFreq int64, docCount int) (err error) {
	if w.numTerms > 0 {
		// Add empty term to force closing of all final blocks:
		if err = w.pushTerm(nil); err != nil {
			return err
		}

		// TODO: if len(pending) is already 1 with a non-zero prefix length
		// we can save writing a "degenerate" root block, but we have to
		// fix all the places that assume the root block's prefix is the empty string:
		if err = w.writeBlocks(0, len(w.pending)); err != nil {
			return err
		}

		// we better have one final "root" block:
		assert2(len(w.pending) == 1 && !w.pending[0].isTerm(),
			"len(pending) = %v pending=%v", len(w.pending), w.pending)
		root := w.pending[0].(*PendingBlock)
		assert2(len(root.prefix) == 0, "%v", root.prefix)
		assert(root.index.EmptyOutput() != nil)

		w.sumTotalTermFreq = sumTotalTermFreq
		w.sumDocFreq = sumDocFreq
		w.docCount = docCount

		// Write FST to index
		w.indexStartFP = w.owner.indexOut.FilePointer()
		err = root.index.Save(w.owner.indexOut)
		if err != nil {
			return err
		}
		// fmt.Printf(" write FST %v field=%v\n", w.indexStartFP, w.fieldInfo.Name)

		assert(w.firstPendingTerm != nil)
		minTerm := w.firstPendingTerm.term
		assert(w.lastPendingTerm != nil)
		maxTerm := w.lastPendingTerm.term

		w.owner.fields = append(w.owner.fields, newFieldMetaData(
			w.fieldInfo,
			w.pending[0].(*PendingBlock).index.EmptyOutput().([]byte),
			w.numTerms,
			w.indexStartFP,
			sumTotalTermFreq,
			sumDocFreq,
			docCount,
			w.longsSize,
			minTerm, maxTerm))
	} else {
		assert(sumTotalTermFreq == 0 || w.fieldInfo.IndexOptions() == INDEX_OPT_DOCS_ONLY && sumTotalTermFreq == -1)
		assert(sumDocFreq == 0)
		assert(docCount == 0)
	}
	return nil
}

func (w *BlockTreeTermsWriter) Close() (err error) {
	var success = false
	defer func() {
		if success {
			util.Close(w.out, w.indexOut, w.postingsWriter)
		} else {
			util.CloseWhileSuppressingError(w.out, w.indexOut, w.postingsWriter)
		}
	}()

	dirStart := w.out.FilePointer()
	indexDirStart := w.indexOut.FilePointer()

	if err = w.out.WriteVInt(int32(len(w.fields))); err != nil {
		return
	}

	for _, field := range w.fields {
		// fmt.Printf(" field %v %v terms\n", field.fieldInfo.Name, field.numTerms)
		if err = w.out.WriteVInt(field.fieldInfo.Number); err == nil {
			assert(field.numTerms > 0)
			if err = w.out.WriteVLong(field.numTerms); err == nil {
				if err = w.out.WriteVInt(int32(len(field.rootCode))); err == nil {
					err = w.out.WriteBytes(field.rootCode)
					if err == nil && field.fieldInfo.IndexOptions() != INDEX_OPT_DOCS_ONLY {
						err = w.out.WriteVLong(field.sumTotalTermFreq)
					}
					if err == nil {
						if err = w.out.WriteVLong(field.sumDocFreq); err == nil {
							if err = w.out.WriteVInt(int32(field.docCount)); err == nil {
								if err = w.out.WriteVInt(int32(field.longsSize)); err == nil {
									if err = w.indexOut.WriteVLong(field.indexStartFP); err == nil {
										if err = writeBytesRef(w.out, field.minTerm); err == nil {
											err = writeBytesRef(w.out, field.maxTerm)
										}
									}
								}
							}
						}
					}
				}
			}
		}
	}
	if err == nil {
		if err = w.writeTrailer(w.out, dirStart); err == nil {
			if err = codec.WriteFooter(w.out); err == nil {
				if err = w.writeIndexTrailer(w.indexOut, indexDirStart); err == nil {
					if err = codec.WriteFooter(w.indexOut); err == nil {
						success = true
					}
				}
			}
		}
	}
	return
}

func writeBytesRef(out store.IndexOutput, bytes []byte) (err error) {
	if err = out.WriteVInt(int32(len(bytes))); err == nil {
		err = out.WriteBytes(bytes)
	}
	return
}
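
// File layout summary (for reference, derived from the writes in this file):
//
//   terms file (TERMS_EXTENSION, "tim"): codec header written by WriteHeader,
//   the term blocks emitted by writeBlock, then the field summary written by
//   Close (field count, and per field: field number, numTerms, rootCode,
//   sumTotalTermFreq unless the field is DOCS_ONLY, sumDocFreq, docCount,
//   longsSize, minTerm, maxTerm via writeBytesRef), a long pointing back to
//   the start of that summary (writeTrailer with dirStart), and the codec
//   footer.
//
//   terms index file (TERMS_INDEX_EXTENSION, "tip"): codec header written by
//   WriteIndexHeader, each field's FST index saved by TermsWriter.Finish at
//   its indexStartFP, then one vLong per field (written by Close) recording
//   that indexStartFP, the index trailer long (writeIndexTrailer), and the
//   codec footer.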