github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/lucene41/postingsWriter.go

package lucene41

import (
	"fmt"
	"github.com/balzaczyy/golucene/core/codec"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	. "github.com/balzaczyy/golucene/core/index/model"
	"github.com/balzaczyy/golucene/core/store"
	"github.com/balzaczyy/golucene/core/util"
	"github.com/balzaczyy/golucene/core/util/packed"
	"reflect"
)

// Lucene41PostingsWriter.java

/*
Expert: the maximum number of skip levels. Smaller values result in
slightly smaller indexes, but slower skipping in big posting lists.
*/
const maxSkipLevels = 10

const (
	LUCENE41_TERMS_CODEC = "Lucene41PostingsWriterTerms"
	LUCENE41_DOC_CODEC   = "Lucene41PostingsWriterDoc"
	LUCENE41_POS_CODEC   = "Lucene41PostingsWriterPos"
	LUCENE41_PAY_CODEC   = "Lucene41PostingsWriterPay"

	LUCENE41_VERSION_START      = 0
	LUCENE41_VERSION_META_ARRAY = 1
	LUCENE41_VERSION_CHECKSUM   = 2
	LUCENE41_VERSION_CURRENT    = LUCENE41_VERSION_CHECKSUM
)

/*
Concrete type that writes the docId list (and optionally freqs, positions,
offsets and payloads) in the postings format.

The postings list for each term is stored separately.
*/
type Lucene41PostingsWriter struct {
	docOut store.IndexOutput
	posOut store.IndexOutput
	payOut store.IndexOutput

	lastState *intBlockTermState

	fieldHasFreqs     bool
	fieldHasPositions bool
	fieldHasOffsets   bool
	fieldHasPayloads  bool

	// Holds starting file pointers for current term:
	docStartFP int64
	posStartFP int64
	payStartFP int64

	docDeltaBuffer []int
	freqBuffer     []int
	docBufferUpto  int

	posDeltaBuffer         []int
	payloadLengthBuffer    []int
	offsetStartDeltaBuffer []int
	offsetLengthBuffer     []int
	posBufferUpto          int

	payloadBytes    []byte
	payloadByteUpto int

	lastBlockDocId           int
	lastBlockPosFP           int64
	lastBlockPayFP           int64
	lastBlockPosBufferUpto   int
	lastBlockPayloadByteUpto int

	lastDocId       int
	lastPosition    int
	lastStartOffset int
	docCount        int

	encoded []byte

	forUtil    *ForUtil
	skipWriter *SkipWriter
}

/* Creates a postings writer with the specified PackedInts overhead ratio */
func newLucene41PostingsWriter(state *SegmentWriteState,
	acceptableOverheadRatio float32) (*Lucene41PostingsWriter, error) {
	docOut, err := state.Directory.CreateOutput(
		util.SegmentFileName(state.SegmentInfo.Name,
			state.SegmentSuffix,
			LUCENE41_DOC_EXTENSION),
		state.Context)
	if err != nil {
		return nil, err
	}

	ans := new(Lucene41PostingsWriter)
	if err = func() error {
		var posOut store.IndexOutput
		var payOut store.IndexOutput
		var success = false
		defer func() {
			if !success {
				util.CloseWhileSuppressingError(docOut, posOut, payOut)
			}
		}()

		err := codec.WriteHeader(docOut, LUCENE41_DOC_CODEC, LUCENE41_VERSION_CURRENT)
		if err != nil {
			return err
		}
		ans.forUtil, err = NewForUtilInto(acceptableOverheadRatio, docOut)
		if err != nil {
			return err
		}
		if state.FieldInfos.HasProx {
			ans.posDeltaBuffer = make([]int, MAX_DATA_SIZE)
			posOut, err = state.Directory.CreateOutput(util.SegmentFileName(
				state.SegmentInfo.Name, state.SegmentSuffix, LUCENE41_POS_EXTENSION),
				state.Context)
			if err != nil {
				return err
			}

			err = codec.WriteHeader(posOut, LUCENE41_POS_CODEC, LUCENE41_VERSION_CURRENT)
			if err != nil {
				return err
			}

			if state.FieldInfos.HasPayloads {
				ans.payloadBytes = make([]byte, 128)
				ans.payloadLengthBuffer = make([]int, MAX_DATA_SIZE)
			}

			if state.FieldInfos.HasOffsets {
				ans.offsetStartDeltaBuffer = make([]int, MAX_DATA_SIZE)
				ans.offsetLengthBuffer = make([]int, MAX_DATA_SIZE)
			}

			if state.FieldInfos.HasPayloads || state.FieldInfos.HasOffsets {
				payOut, err = state.Directory.CreateOutput(util.SegmentFileName(
					state.SegmentInfo.Name, state.SegmentSuffix, LUCENE41_PAY_EXTENSION),
					state.Context)
				if err != nil {
					return err
				}
				if err = codec.WriteHeader(payOut, LUCENE41_PAY_CODEC, LUCENE41_VERSION_CURRENT); err != nil {
					return err
				}
			}
		}
		ans.payOut, ans.posOut = payOut, posOut
		ans.docOut = docOut
		success = true
		return nil
	}(); err != nil {
		return nil, err
	}

	ans.docDeltaBuffer = make([]int, MAX_DATA_SIZE)
	ans.freqBuffer = make([]int, MAX_DATA_SIZE)
	ans.encoded = make([]byte, MAX_ENCODED_SIZE)

	// TODO: should we try skipping every 2/4 blocks...?
	ans.skipWriter = NewSkipWriter(
		maxSkipLevels,
		LUCENE41_BLOCK_SIZE,
		state.SegmentInfo.DocCount(),
		ans.docOut,
		ans.posOut,
		ans.payOut)

	return ans, nil
}
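
// The following sketch is not part of the original source. It is a rough
// illustration of how a postings format implementation might wire this
// writer up, assuming a *SegmentWriteState is already available from the
// indexing chain; only functions defined in this file are referenced.
//
//	func writePostingsSketch(state *SegmentWriteState) error {
//		w, err := newLucene41PostingsWriterCompact(state) // PackedInts.COMPACT overhead ratio
//		if err != nil {
//			return err
//		}
//		defer w.Close() // writes footers and closes the .doc/.pos/.pay outputs
//		// Hand w to a terms dictionary writer, which drives SetField, StartTerm,
//		// StartDoc, AddPosition, FinishDoc and FinishTerm for every field and term.
//		return nil
//	}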

/* Creates a postings writer with PackedInts.COMPACT */
func newLucene41PostingsWriterCompact(state *SegmentWriteState) (*Lucene41PostingsWriter, error) {
	return newLucene41PostingsWriter(state, packed.PackedInts.COMPACT)
}

type intBlockTermState struct {
	*BlockTermState
	docStartFP         int64
	posStartFP         int64
	payStartFP         int64
	skipOffset         int64
	lastPosBlockOffset int64
	// docid when there is a single pulsed posting, otherwise -1
	// freq is always implicitly totalTermFreq in this case.
	singletonDocID int
}

var emptyState = newIntBlockTermState()

func newIntBlockTermState() *intBlockTermState {
	ts := &intBlockTermState{
		skipOffset:         -1,
		lastPosBlockOffset: -1,
		singletonDocID:     -1,
	}
	parent := NewBlockTermState()
	ts.BlockTermState, parent.Self = parent, ts
	return ts
}

func (ts *intBlockTermState) Clone() TermState {
	clone := newIntBlockTermState()
	clone.CopyFrom(ts)
	return clone
}

func (ts *intBlockTermState) CopyFrom(other TermState) {
	assert(other != nil)
	if ots, ok := other.(*intBlockTermState); ok {
		ts.BlockTermState.CopyFrom_(ots.BlockTermState)
		ts.docStartFP = ots.docStartFP
		ts.posStartFP = ots.posStartFP
		ts.payStartFP = ots.payStartFP
		ts.lastPosBlockOffset = ots.lastPosBlockOffset
		ts.skipOffset = ots.skipOffset
		ts.singletonDocID = ots.singletonDocID
	} else {
		panic(fmt.Sprintf("Can not copy from %v", reflect.TypeOf(other).Name()))
	}
}

func (ts *intBlockTermState) String() string {
	return fmt.Sprintf("%v docStartFP=%v posStartFP=%v payStartFP=%v lastPosBlockOffset=%v skipOffset=%v singletonDocID=%v",
		ts.BlockTermState, ts.docStartFP, ts.posStartFP, ts.payStartFP, ts.lastPosBlockOffset, ts.skipOffset, ts.singletonDocID)
}

func (w *Lucene41PostingsWriter) NewTermState() *BlockTermState {
	return newIntBlockTermState().BlockTermState
}

func (w *Lucene41PostingsWriter) Init(termsOut store.IndexOutput) error {
	err := codec.WriteHeader(termsOut, LUCENE41_TERMS_CODEC, LUCENE41_VERSION_CURRENT)
	if err == nil {
		err = termsOut.WriteVInt(LUCENE41_BLOCK_SIZE)
	}
	return err
}

func (w *Lucene41PostingsWriter) SetField(fieldInfo *FieldInfo) int {
	n := int(fieldInfo.IndexOptions())
	w.fieldHasFreqs = n >= int(INDEX_OPT_DOCS_AND_FREQS)
	w.fieldHasPositions = n >= int(INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS)
	w.fieldHasOffsets = n >= int(INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
	w.fieldHasPayloads = fieldInfo.HasPayloads()
	w.skipWriter.SetField(w.fieldHasPositions, w.fieldHasOffsets, w.fieldHasPayloads)
	w.lastState = emptyState
	if w.fieldHasPositions {
		if w.fieldHasPayloads || w.fieldHasOffsets {
			return 3 // doc + pos + pay FP
		} else {
			return 2 // doc + pos FP
		}
	} else {
		return 1 // doc FP
	}
}
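
// Illustration, not part of the original source: the value returned by
// SetField is the number of int64 metadata "longs" the terms dictionary
// reserves per term for this field, and EncodeTerm below fills exactly that
// many slots with file-pointer deltas. A hypothetical caller might look like:
//
//	numLongs := w.SetField(fieldInfo) // 1, 2 or 3 depending on the index options
//	longs := make([]int64, numLongs)  // reused for every term of this field
//	// per term, after FinishTerm:
//	//   err := w.EncodeTerm(longs, out, fieldInfo, termState, absolute)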

func (w *Lucene41PostingsWriter) StartTerm() error {
	w.docStartFP = w.docOut.FilePointer()
	if w.fieldHasPositions {
		w.posStartFP = w.posOut.FilePointer()
		if w.fieldHasPayloads || w.fieldHasOffsets {
			w.payStartFP = w.payOut.FilePointer()
		}
	}
	w.lastDocId = 0
	w.lastBlockDocId = -1
	w.skipWriter.ResetSkip()
	return nil
}

func (w *Lucene41PostingsWriter) StartDoc(docId, termDocFreq int) error {
	// We have collected a block of docs and a new doc has arrived: write
	// skip data as well as the postings list for the current block.
	if w.lastBlockDocId != -1 && w.docBufferUpto == 0 {
		if err := w.skipWriter.BufferSkip(w.lastBlockDocId, w.docCount,
			w.lastBlockPosFP, w.lastBlockPayFP, w.lastBlockPosBufferUpto,
			w.lastBlockPayloadByteUpto); err != nil {
			return err
		}
	}

	docDelta := docId - w.lastDocId
	if docId < 0 || (w.docCount > 0 && docDelta <= 0) {
		return fmt.Errorf(
			"docs out of order (%v <= %v) (docOut: %v)",
			docId, w.lastDocId, w.docOut)
	}
	w.docDeltaBuffer[w.docBufferUpto] = docDelta
	if w.fieldHasFreqs {
		w.freqBuffer[w.docBufferUpto] = termDocFreq
	}
	w.docBufferUpto++
	w.docCount++

	if w.docBufferUpto == LUCENE41_BLOCK_SIZE {
		if err := w.forUtil.writeBlock(w.docDeltaBuffer, w.encoded, w.docOut); err != nil {
			return err
		}
		if w.fieldHasFreqs {
			if err := w.forUtil.writeBlock(w.freqBuffer, w.encoded, w.docOut); err != nil {
				return err
			}
		}
		// NOTE: don't set docBufferUpto back to 0 here; finishDoc will
		// do so (because it needs to see that the block was filled so it
		// can save skip data)
	}

	w.lastDocId = docId
	w.lastPosition = 0
	w.lastStartOffset = 0
	return nil
}
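
// Worked example, added for clarity and not part of the original source: with
// LUCENE41_BLOCK_SIZE = 128, the docIds 5, 8, 12, ... are buffered as the
// deltas 5, 3, 4, ... (plus their freqs when the field has freqs). Once 128
// docs are buffered, the block is bulk-compressed by ForUtil into the .doc
// file; whatever is left when the term ends is vInt-encoded by FinishTerm.
//
//	last, deltas := 0, make([]int, 0, LUCENE41_BLOCK_SIZE)
//	for _, docId := range []int{5, 8, 12} {
//		deltas = append(deltas, docId-last) // 5, 3, 4
//		last = docId
//	}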

/* Add a new position & payload */
func (w *Lucene41PostingsWriter) AddPosition(position int,
	payload []byte, startOffset, endOffset int) error {

	w.posDeltaBuffer[w.posBufferUpto] = position - w.lastPosition
	if w.fieldHasPayloads {
		if len(payload) == 0 {
			// no payload
			w.payloadLengthBuffer[w.posBufferUpto] = 0
		} else {
			panic("not implemented yet")
		}
	}

	if w.fieldHasOffsets {
		panic("not implemented yet")
	}

	w.posBufferUpto++
	w.lastPosition = position
	if w.posBufferUpto == LUCENE41_BLOCK_SIZE {
		if err := w.forUtil.writeBlock(w.posDeltaBuffer, w.encoded, w.posOut); err != nil {
			return err
		}

		if w.fieldHasPayloads {
			panic("not implemented yet")
		}
		if w.fieldHasOffsets {
			panic("not implemented yet")
		}
		w.posBufferUpto = 0
	}
	return nil
}
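
// Illustration, not part of the original source: positions are buffered as
// deltas against the previous position of the same document (lastPosition is
// reset to 0 by StartDoc), so the positions 3, 7, 7, 10 within one document
// are buffered as the deltas 3, 4, 0, 3. Like doc deltas, a full buffer of
// LUCENE41_BLOCK_SIZE deltas is bulk-compressed into the .pos file and any
// remainder is vInt-encoded by FinishTerm.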

func (w *Lucene41PostingsWriter) FinishDoc() error {
	// Since we don't know the df for the current term up front, we buffer the
	// skip data for each block here and write it to the skip file when the
	// next doc arrives (see StartDoc).
	if w.docBufferUpto == LUCENE41_BLOCK_SIZE {
		w.lastBlockDocId = w.lastDocId
		if w.posOut != nil {
			if w.payOut != nil {
				w.lastBlockPayFP = w.payOut.FilePointer()
			}
			w.lastBlockPosFP = w.posOut.FilePointer()
			w.lastBlockPosBufferUpto = w.posBufferUpto
			w.lastBlockPayloadByteUpto = w.payloadByteUpto
		}
		w.docBufferUpto = 0
	}
	return nil
}
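
// Added note, not part of the original source: FinishDoc only snapshots the
// block boundary. When the buffer fills (docBufferUpto == LUCENE41_BLOCK_SIZE)
// it records lastBlockDocId together with the current .pos/.pay file pointers
// and buffer offsets, and resets docBufferUpto. The next call to StartDoc then
// sees docBufferUpto == 0 with lastBlockDocId != -1 and turns that snapshot
// into a skip entry via skipWriter.BufferSkip.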

/* Called when we are done adding docs to this term */
func (w *Lucene41PostingsWriter) FinishTerm(_state *BlockTermState) error {
	state := _state.Self.(*intBlockTermState)
	assert(state.DocFreq > 0)

	// TODO: wasteful that we are counting the docs for this term in two places?
	assert2(state.DocFreq == w.docCount, "%v vs %v", state.DocFreq, w.docCount)

	// If docFreq == 1, don't write the single docId/freq to a separate file
	// along with a pointer to it; pulse it into the term dictionary instead.
	var singletonDocId int
	if state.DocFreq == 1 {
		// pulse the singleton docId into the term dictionary, freq is implicitly totalTermFreq
		singletonDocId = w.docDeltaBuffer[0]
	} else {
		singletonDocId = -1
		// vInt encode the remaining doc deltas and freqs:
		var err error
		for i := 0; i < w.docBufferUpto; i++ {
			docDelta := w.docDeltaBuffer[i]
			freq := w.freqBuffer[i]
			if !w.fieldHasFreqs {
				if err = w.docOut.WriteVInt(int32(docDelta)); err != nil {
					return err
				}
			} else if freq == 1 {
				if err = w.docOut.WriteVInt(int32((docDelta << 1) | 1)); err != nil {
					return err
				}
			} else {
				if err = w.docOut.WriteVInt(int32(docDelta << 1)); err != nil {
					return err
				}
				if err = w.docOut.WriteVInt(int32(freq)); err != nil {
					return err
				}
			}
		}
	}

	var lastPosBlockOffset int64
	if w.fieldHasPositions {
		// totalTermFreq is just total number of positions (or payloads,
		// or offsets) associated with current term.
		assert(state.TotalTermFreq != -1)
		if state.TotalTermFreq > LUCENE41_BLOCK_SIZE {
			// record file offset for last pos in last block
			lastPosBlockOffset = w.posOut.FilePointer() - w.posStartFP
		} else {
			lastPosBlockOffset = -1
		}
		if w.posBufferUpto > 0 {
			// TODO: should we send offsets/payloads to .pay...? seems
			// wasteful (have to store extra vlong for low (< BLOCK_SIZE)
			// DF terms = vast vast majority)

			// vInt encode the remaining positions/payloads/offsets:
			// lastPayloadLength := -1 // force first payload length to be written
			// lastOffsetLength := -1  // force first offset length to be written
			payloadBytesReadUpto := 0
			for i := 0; i < w.posBufferUpto; i++ {
				posDelta := w.posDeltaBuffer[i]
				if w.fieldHasPayloads {
					panic("not implemented yet")
				} else {
					err := w.posOut.WriteVInt(int32(posDelta))
					if err != nil {
						return err
					}
				}

				if w.fieldHasOffsets {
					panic("not implemented yet")
				}
			}

			if w.fieldHasPayloads {
				assert(payloadBytesReadUpto == w.payloadByteUpto)
				w.payloadByteUpto = 0
			}
		}
	} else {
		lastPosBlockOffset = -1
	}

	var skipOffset int64
	if w.docCount > LUCENE41_BLOCK_SIZE {
		n, err := w.skipWriter.WriteSkip(w.docOut)
		if err != nil {
			return err
		}
		skipOffset = n - w.docStartFP
	} else {
		skipOffset = -1
	}

	state.docStartFP = w.docStartFP
	state.posStartFP = w.posStartFP
	state.payStartFP = w.payStartFP
	state.singletonDocID = singletonDocId
	state.skipOffset = skipOffset
	state.lastPosBlockOffset = lastPosBlockOffset
	w.docBufferUpto = 0
	w.posBufferUpto = 0
	w.lastDocId = 0
	w.docCount = 0
	return nil
}
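
// Sketch of the tail encoding used above, added for illustration only (the
// helper name encodeTail is hypothetical and not part of the original source):
// docs that did not fill a complete block are vInt-encoded, and when the field
// has freqs the low bit of the shifted delta signals "freq == 1" so that the
// most common freq value costs no extra byte.
//
//	func encodeTail(out store.IndexOutput, docDelta, freq int, hasFreqs bool) error {
//		switch {
//		case !hasFreqs:
//			return out.WriteVInt(int32(docDelta))
//		case freq == 1:
//			return out.WriteVInt(int32((docDelta << 1) | 1)) // low bit set: freq is 1
//		default:
//			if err := out.WriteVInt(int32(docDelta << 1)); err != nil { // low bit clear
//				return err
//			}
//			return out.WriteVInt(int32(freq))
//		}
//	}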

func (w *Lucene41PostingsWriter) EncodeTerm(longs []int64,
	out util.DataOutput, fieldInfo *FieldInfo, _state *BlockTermState,
	absolute bool) (err error) {

	assert(longs != nil)
	assert(len(longs) > 0)
	state := _state.Self.(*intBlockTermState)
	if absolute {
		w.lastState = emptyState
	}
	longs[0] = state.docStartFP - w.lastState.docStartFP
	if w.fieldHasPositions {
		longs[1] = state.posStartFP - w.lastState.posStartFP
		if w.fieldHasPayloads || w.fieldHasOffsets {
			longs[2] = state.payStartFP - w.lastState.payStartFP
		}
	}
	if state.singletonDocID != -1 {
		if err = out.WriteVInt(int32(state.singletonDocID)); err != nil {
			return
		}
	}
	if w.fieldHasPositions && state.lastPosBlockOffset != -1 {
		if err = out.WriteVLong(state.lastPosBlockOffset); err != nil {
			return
		}
	}
	if state.skipOffset != -1 {
		if err = out.WriteVLong(state.skipOffset); err != nil {
			return
		}
	}
	w.lastState = state
	return nil
}
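
// Illustration, not part of the original source: the metadata longs hold file
// pointers delta-encoded against the previous term of the same field. With two
// consecutive terms whose .doc data starts at file pointers 1000 and 1400:
//
//	// absolute == true  (first term of a block):  longs[0] = 1000 - 0    = 1000
//	// absolute == false (subsequent term):        longs[0] = 1400 - 1000 = 400
//
// Resetting lastState to emptyState when absolute is true is what makes the
// first entry of each block decodable on its own.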

func (w *Lucene41PostingsWriter) Close() (err error) {
	var success = false
	defer func() {
		if success {
			err = util.Close(w.docOut, w.posOut, w.payOut)
		} else {
			util.CloseWhileSuppressingError(w.docOut, w.posOut, w.payOut)
		}
		w.docOut = nil
		w.posOut = nil
		w.payOut = nil
	}()

	if err == nil && w.docOut != nil {
		err = codec.WriteFooter(w.docOut)
	}
	if err == nil && w.posOut != nil {
		err = codec.WriteFooter(w.posOut)
	}
	if err == nil && w.payOut != nil {
		err = codec.WriteFooter(w.payOut)
	}
	if err != nil {
		return
	}
	success = true
	return nil
}