github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/codec/lucene41/postingsReader.go

     1  package lucene41
     2  
     3  import (
     4  	"fmt"
     5  	"github.com/balzaczyy/golucene/core/codec"
     6  	. "github.com/balzaczyy/golucene/core/codec/spi"
     7  	. "github.com/balzaczyy/golucene/core/index/model"
     8  	. "github.com/balzaczyy/golucene/core/search/model"
     9  	"github.com/balzaczyy/golucene/core/store"
    10  	"github.com/balzaczyy/golucene/core/util"
    11  )
    12  
    13  // Lucene41PostingsReader.java
    14  
    15  /*
    16  Concrete type that reads the docID list (and, when present, freq, position,
    17  offset, and payload data) written by the Lucene 4.1 postings format.
    18  */
    19  type Lucene41PostingsReader struct {
    20  	docIn   store.IndexInput
    21  	posIn   store.IndexInput
    22  	payIn   store.IndexInput
    23  	forUtil *ForUtil
    24  	version int
    25  }
    26  
    27  func NewLucene41PostingsReader(dir store.Directory,
    28  	fis FieldInfos, si *SegmentInfo,
    29  	ctx store.IOContext, segmentSuffix string) (r PostingsReaderBase, err error) {
    30  
    31  	// fmt.Println("Initializing Lucene41PostingsReader...")
    32  	success := false
    33  	var docIn, posIn, payIn store.IndexInput = nil, nil, nil
    34  	defer func() {
    35  		if !success {
    36  			fmt.Println("Failed to initialize Lucene41PostingsReader.")
    37  			util.CloseWhileSuppressingError(docIn, posIn, payIn)
    38  		}
    39  	}()
    40  
    41  	docIn, err = dir.OpenInput(util.SegmentFileName(si.Name, segmentSuffix, LUCENE41_DOC_EXTENSION), ctx)
    42  	if err != nil {
    43  		return nil, err
    44  	}
    45  	var version int32
    46  	version, err = codec.CheckHeader(docIn, LUCENE41_DOC_CODEC, LUCENE41_VERSION_START, LUCENE41_VERSION_CURRENT)
    47  	if err != nil {
    48  		return nil, err
    49  	}
    50  	forUtil, err := NewForUtilFrom(docIn)
    51  	if err != nil {
    52  		return nil, err
    53  	}
    54  
    55  	if version >= LUCENE41_VERSION_CHECKSUM {
    56  		// NOTE: verifying the checksum of the data file against all of its
    57  		// bytes on open would be too costly, but for now we at least verify
    58  		// proper structure of the checksum footer, which looks for
    59  		// FOOTER_MAGIC + algorithmID. This is cheap and can detect some
    60  		// forms of corruption such as file truncation.
    61  		if _, err = codec.RetrieveChecksum(docIn); err != nil {
    62  			return nil, err
    63  		}
    64  	}
    65  
    66  	if fis.HasProx {
    67  		posIn, err = dir.OpenInput(util.SegmentFileName(si.Name, segmentSuffix, LUCENE41_POS_EXTENSION), ctx)
    68  		if err != nil {
    69  			return nil, err
    70  		}
    71  		_, err = codec.CheckHeader(posIn, LUCENE41_POS_CODEC, version, version)
    72  		if err != nil {
    73  			return nil, err
    74  		}
    75  
    76  		if version >= LUCENE41_VERSION_CHECKSUM {
    77  			// NOTE: verifying the checksum of the data file against all of its
    78  			// bytes on open would be too costly, but for now we at least verify
    79  			// proper structure of the checksum footer, which looks for
    80  			// FOOTER_MAGIC + algorithmID. This is cheap and can detect some
    81  			// forms of corruption such as file truncation.
    82  			if _, err = codec.RetrieveChecksum(posIn); err != nil {
    83  				return nil, err
    84  			}
    85  		}
    86  
    87  		if fis.HasPayloads || fis.HasOffsets {
    88  			payIn, err = dir.OpenInput(util.SegmentFileName(si.Name, segmentSuffix, LUCENE41_PAY_EXTENSION), ctx)
    89  			if err != nil {
    90  				return nil, err
    91  			}
    92  			_, err = codec.CheckHeader(payIn, LUCENE41_PAY_CODEC, version, version)
    93  			if err != nil {
    94  				return nil, err
    95  			}
    96  
    97  			if version >= LUCENE41_VERSION_CHECKSUM {
    98  				// NOTE: verifying the checksum of the data file against all of
    99  				// its bytes on open would be too costly, but for now we at least
   100  				// verify proper structure of the checksum footer, which looks for
   101  				// FOOTER_MAGIC + algorithmID. This is cheap and can detect some
   102  				// forms of corruption such as file truncation.
   103  				if _, err = codec.RetrieveChecksum(payIn); err != nil {
   104  					return nil, err
   105  				}
   106  
   107  			}
   108  		}
   109  	}
   110  
   111  	success = true
   112  	return &Lucene41PostingsReader{docIn, posIn, payIn, forUtil, int(version)}, nil
   113  }
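
// The outline below is added commentary, not code from golucene: it sketches
// how the pieces defined in this file are typically driven by the enclosing
// postings format. The names dir, fis, si, ctx, termsIn, fi, longs, in, and
// liveDocs are placeholders, and error handling is elided.
//
//	r, _ := NewLucene41PostingsReader(dir, fis, si, ctx, "") // opens .doc (and .pos/.pay when needed)
//	defer r.Close()
//	_ = r.Init(termsIn)                       // verify the terms dictionary header
//	ts := r.NewTermState()                    // reusable per-term state
//	_ = r.DecodeTerm(longs, in, fi, ts, true) // decode one term's file pointers/offsets
//	de, _ := r.Docs(fi, ts, liveDocs, nil, DOCS_ENUM_FLAG_FREQS)
//	_ = de                                    // iterate with NextDoc/Advance (see below)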
   114  
   115  func (r *Lucene41PostingsReader) Init(termsIn store.IndexInput) error {
   116  	// fmt.Println("Initializing from:", termsIn)
   117  	// Make sure we are talking to the matching postings writer
   118  	_, err := codec.CheckHeader(termsIn, LUCENE41_TERMS_CODEC, LUCENE41_VERSION_START, LUCENE41_VERSION_CURRENT)
   119  	if err != nil {
   120  		return err
   121  	}
   122  	indexBlockSize, err := termsIn.ReadVInt()
   123  	if err != nil {
   124  		return err
   125  	}
   126  	// fmt.Println("Index block size:", indexBlockSize)
   127  	if indexBlockSize != LUCENE41_BLOCK_SIZE {
   128  		return fmt.Errorf("index-time BLOCK_SIZE (%v) != read-time BLOCK_SIZE (%v)", indexBlockSize, LUCENE41_BLOCK_SIZE)
   129  	}
   130  	return nil
   131  }
   132  
   133  // readVIntBlock reads values that have been written using variable-length
   134  // encoding instead of bit-packing. It is used for a term's final, partial
   135  // block of postings, which holds fewer than LUCENE41_BLOCK_SIZE entries.
   136  func readVIntBlock(docIn store.IndexInput, docBuffer []int,
   137  	freqBuffer []int, num int, indexHasFreq bool) (err error) {
   138  	if indexHasFreq {
   139  		for i := 0; i < num; i++ {
   140  			code, err := asInt(docIn.ReadVInt())
   141  			if err != nil {
   142  				return err
   143  			}
   144  			docBuffer[i] = int(uint(code) >> 1)
   145  			if (code & 1) != 0 {
   146  				freqBuffer[i] = 1
   147  			} else {
   148  				freqBuffer[i], err = asInt(docIn.ReadVInt())
   149  				if err != nil {
   150  					return err
   151  				}
   152  			}
   153  		}
   154  	} else {
   155  		for i := 0; i < num; i++ {
   156  			docBuffer[i], err = asInt(docIn.ReadVInt())
   157  			if err != nil {
   158  				return err
   159  			}
   160  		}
   161  	}
   162  	return nil
   163  }
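
// A minimal encoding sketch, inferred from the decoding loop above rather than
// taken from the original writer: with freqs indexed, each doc delta appears to
// carry a "freq == 1" flag in its low bit (delta<<1|1 when the freq is 1,
// otherwise delta<<1 followed by the freq as its own vInt); without freqs,
// deltas are plain vInts. encodeVIntDocDelta is a hypothetical helper, added
// here for illustration only; it returns the logical vInt values rather than
// their byte encoding. Example: delta=5, freq=1 -> [11]; delta=3, freq=7 -> [6, 7].
func encodeVIntDocDelta(delta, freq int, indexHasFreq bool) []int32 {
	if !indexHasFreq {
		return []int32{int32(delta)}
	}
	if freq == 1 {
		return []int32{int32(delta<<1 | 1)}
	}
	return []int32{int32(delta << 1), int32(freq)}
}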
   164  
   165  func asInt(n int32, err error) (int, error) {
   166  	return int(n), err
   167  }
   168  
   169  func (r *Lucene41PostingsReader) NewTermState() *BlockTermState {
   170  	return newIntBlockTermState().BlockTermState
   171  }
   172  
   173  func (r *Lucene41PostingsReader) Close() error {
   174  	return util.Close(r.docIn, r.posIn, r.payIn)
   175  }
   176  
   177  func (r *Lucene41PostingsReader) DecodeTerm(longs []int64,
   178  	in util.DataInput, fieldInfo *FieldInfo,
   179  	_termState *BlockTermState, absolute bool) (err error) {
   180  
   181  	termState := _termState.Self.(*intBlockTermState)
   182  	fieldHasPositions := fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS
   183  	fieldHasOffsets := fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
   184  	fieldHasPayloads := fieldInfo.HasPayloads()
   185  
   186  	if absolute {
   187  		termState.docStartFP = 0
   188  		termState.posStartFP = 0
   189  		termState.payStartFP = 0
   190  	}
   191  	if r.version < LUCENE41_VERSION_META_ARRAY { // backward compatibility
   192  		return r._decodeTerm(in, fieldInfo, termState)
   193  	}
   194  	termState.docStartFP += longs[0]
   195  	if fieldHasPositions {
   196  		termState.posStartFP += longs[1]
   197  		if fieldHasOffsets || fieldHasPayloads {
   198  			termState.payStartFP += longs[2]
   199  		}
   200  	}
   201  	if termState.DocFreq == 1 {
   202  		if termState.singletonDocID, err = asInt(in.ReadVInt()); err != nil {
   203  			return
   204  		}
   205  	} else {
   206  		termState.singletonDocID = -1
   207  	}
   208  	if fieldHasPositions {
   209  		if termState.TotalTermFreq > LUCENE41_BLOCK_SIZE {
   210  			if termState.lastPosBlockOffset, err = in.ReadVLong(); err != nil {
   211  				return err
   212  			}
   213  		} else {
   214  			termState.lastPosBlockOffset = -1
   215  		}
   216  	}
   217  	if termState.DocFreq > LUCENE41_BLOCK_SIZE {
   218  		if termState.skipOffset, err = in.ReadVLong(); err != nil {
   219  			return
   220  		}
   221  	} else {
   222  		termState.skipOffset = -1
   223  	}
   224  	return nil
   225  }
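
// Added orientation note, derived from the reads in DecodeTerm above rather
// than from a format specification: longs[0] carries the delta of the .doc
// start file pointer, longs[1] the .pos delta when the field indexes
// positions, and longs[2] the .pay delta when it also has offsets or payloads.
// The byte stream then supplies, in order: the singleton docID (vInt) when
// DocFreq == 1, lastPosBlockOffset (vLong) when positions are indexed and
// TotalTermFreq > LUCENE41_BLOCK_SIZE, and skipOffset (vLong) when
// DocFreq > LUCENE41_BLOCK_SIZE; values that are not written default to -1.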
   226  
   227  func (r *Lucene41PostingsReader) _decodeTerm(in util.DataInput,
   228  	fieldInfo *FieldInfo, termState *intBlockTermState) (err error) {
   229  
   230  	fieldHasPositions := fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS
   231  	fieldHasOffsets := fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
   232  	fieldHasPayloads := fieldInfo.HasPayloads()
   233  	if termState.DocFreq == 1 {
   234  		if termState.singletonDocID, err = asInt(in.ReadVInt()); err != nil {
   235  			return
   236  		}
   237  	} else {
   238  		termState.singletonDocID = -1
   239  		var n int64
   240  		if n, err = in.ReadVLong(); err != nil {
   241  			return
   242  		}
   243  		termState.docStartFP += n
   244  	}
   245  	if fieldHasPositions {
   246  		var n int64
   247  		if n, err = in.ReadVLong(); err != nil {
   248  			return
   249  		}
   250  		termState.posStartFP += n
   251  		if termState.TotalTermFreq > LUCENE41_BLOCK_SIZE {
   252  			if n, err = in.ReadVLong(); err != nil {
   253  				return
   254  			}
   255  			termState.lastPosBlockOffset = n // stored per term, not as a running delta
   256  		} else {
   257  			termState.lastPosBlockOffset = -1
   258  		}
   259  		if (fieldHasPayloads || fieldHasOffsets) && termState.TotalTermFreq >= LUCENE41_BLOCK_SIZE {
   260  			if n, err = in.ReadVLong(); err != nil {
   261  				return
   262  			}
   263  			termState.payStartFP += n
   264  		}
   265  	}
   266  	if termState.DocFreq > LUCENE41_BLOCK_SIZE {
   267  		if termState.skipOffset, err = in.ReadVLong(); err != nil {
   268  			return
   269  		}
   270  	} else {
   271  		termState.skipOffset = -1
   272  	}
   273  	return nil
   274  }
   275  
   276  func (r *Lucene41PostingsReader) Docs(fieldInfo *FieldInfo,
   277  	termState *BlockTermState, liveDocs util.Bits,
   278  	reuse DocsEnum, flags int) (de DocsEnum, err error) {
   279  
   280  	var docsEnum *blockDocsEnum
   281  	if v, ok := reuse.(*blockDocsEnum); ok {
   282  		docsEnum = v
   283  		if !docsEnum.canReuse(r.docIn, fieldInfo) {
   284  			docsEnum = newBlockDocsEnum(r, fieldInfo)
   285  		}
   286  	} else {
   287  		docsEnum = newBlockDocsEnum(r, fieldInfo)
   288  	}
   289  	return docsEnum.reset(liveDocs, termState.Self.(*intBlockTermState), flags)
   290  }
   291  
   292  type blockDocsEnum struct {
   293  	*Lucene41PostingsReader // embedded struct
   294  
   295  	encoded []byte
   296  
   297  	docDeltaBuffer []int
   298  	freqBuffer     []int
   299  
   300  	docBufferUpto int
   301  
   302  	// skipper Lucene41SkipReader
   303  	skipped bool
   304  
   305  	startDocIn store.IndexInput
   306  
   307  	docIn            store.IndexInput
   308  	indexHasFreq     bool
   309  	indexHasPos      bool
   310  	indexHasOffsets  bool
   311  	indexHasPayloads bool
   312  
   313  	docFreq       int
   314  	totalTermFreq int64
   315  	docUpto       int
   316  	doc           int
   317  	accum         int
   318  	freq          int
   319  
   320  	// Where this term's postings start in the .doc file:
   321  	docTermStartFP int64
   322  
   323  	// Where this term's skip data starts (after
   324  	// docTermStartFP) in the .doc file (or -1 if there is
   325  	// no skip data for this term):
   326  	skipOffset int64
   327  
   328  	// docID for next skip point, we won't use skipper if
   329  	// target docID is not larger than this
   330  	nextSkipDoc int
   331  
   332  	liveDocs util.Bits
   333  
   334  	needsFreq      bool
   335  	singletonDocID int
   336  }
   337  
   338  func newBlockDocsEnum(owner *Lucene41PostingsReader,
   339  	fieldInfo *FieldInfo) *blockDocsEnum {
   340  
   341  	return &blockDocsEnum{
   342  		Lucene41PostingsReader: owner,
   343  		docDeltaBuffer:         make([]int, MAX_DATA_SIZE),
   344  		freqBuffer:             make([]int, MAX_DATA_SIZE),
   345  		startDocIn:             owner.docIn,
   346  		docIn:                  nil,
   347  		indexHasFreq:           fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS,
   348  		indexHasPos:            fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS,
   349  		indexHasOffsets:        fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
   350  		indexHasPayloads:       fieldInfo.HasPayloads(),
   351  		encoded:                make([]byte, MAX_ENCODED_SIZE),
   352  	}
   353  }
   354  
   355  func (de *blockDocsEnum) canReuse(docIn store.IndexInput, fieldInfo *FieldInfo) bool {
   356  	return docIn == de.startDocIn &&
   357  		de.indexHasFreq == (fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS) &&
   358  		de.indexHasPos == (fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS) &&
   359  		de.indexHasPayloads == fieldInfo.HasPayloads()
   360  }
   361  
   362  func (de *blockDocsEnum) reset(liveDocs util.Bits, termState *intBlockTermState, flags int) (ret DocsEnum, err error) {
   363  	de.liveDocs = liveDocs
   364  	// fmt.Println("  FPR.reset: termState=", termState)
   365  	de.docFreq = termState.DocFreq
   366  	if de.indexHasFreq {
   367  		de.totalTermFreq = termState.TotalTermFreq
   368  	} else {
   369  		de.totalTermFreq = int64(de.docFreq)
   370  	}
   371  	de.docTermStartFP = termState.docStartFP
   372  	de.skipOffset = termState.skipOffset
   373  	de.singletonDocID = termState.singletonDocID
   374  	if de.docFreq > 1 {
   375  		if de.docIn == nil {
   376  			// lazy init
   377  			de.docIn = de.startDocIn.Clone()
   378  		}
   379  		err = de.docIn.Seek(de.docTermStartFP)
   380  		if err != nil {
   381  			return nil, err
   382  		}
   383  	}
   384  
   385  	de.doc = -1
   386  	de.needsFreq = (flags & DOCS_ENUM_FLAG_FREQS) != 0
   387  	if !de.indexHasFreq {
   388  		for i := range de.freqBuffer {
   389  			de.freqBuffer[i] = 1
   390  		}
   391  	}
   392  	de.accum = 0
   393  	de.docUpto = 0
   394  	de.nextSkipDoc = LUCENE41_BLOCK_SIZE - 1 // we won't skip if target is found in first block
   395  	de.docBufferUpto = LUCENE41_BLOCK_SIZE
   396  	de.skipped = false
   397  	return de, nil
   398  }
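
// Added commentary on reset, not from the original file: docIn is cloned from
// startDocIn lazily and only seeked when docFreq > 1; a term that matches a
// single document keeps its docID in the term metadata (singletonDocID), so
// its postings never touch the .doc file here. When the field omits freqs,
// freqBuffer is pre-filled with 1s so Freq() remains well defined.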
   399  
   400  func (de *blockDocsEnum) Freq() (n int, err error) {
   401  	return de.freq, nil
   402  }
   403  
   404  func (de *blockDocsEnum) DocId() int {
   405  	return de.doc
   406  }
   407  
   408  func (de *blockDocsEnum) refillDocs() (err error) {
   409  	left := de.docFreq - de.docUpto
   410  	assert(left > 0)
   411  
   412  	if left >= LUCENE41_BLOCK_SIZE {
   413  		fmt.Println("    fill doc block from fp=", de.docIn.FilePointer())
   414  		panic("not implemented yet")
   415  	} else if de.docFreq == 1 {
   416  		de.docDeltaBuffer[0] = de.singletonDocID
   417  		de.freqBuffer[0] = int(de.totalTermFreq)
   418  	} else {
   419  		// Read vInts:
   420  		// fmt.Println("    fill last vInt block from fp=", de.docIn.FilePointer())
   421  		err = readVIntBlock(de.docIn, de.docDeltaBuffer, de.freqBuffer, left, de.indexHasFreq)
   422  	}
   423  	de.docBufferUpto = 0
   424  	return
   425  }
   426  
   427  func (de *blockDocsEnum) NextDoc() (n int, err error) {
   428  	// fmt.Println("FPR.nextDoc")
   429  	for {
   430  		// fmt.Printf("  docUpto=%v (of df=%v) docBufferUpto=%v\n", de.docUpto, de.docFreq, de.docBufferUpto)
   431  
   432  		if de.docUpto == de.docFreq {
   433  			// fmt.Println("  return doc=END")
   434  			de.doc = NO_MORE_DOCS
   435  			return de.doc, nil
   436  		}
   437  
   438  		if de.docBufferUpto == LUCENE41_BLOCK_SIZE {
   439  			err = de.refillDocs()
   440  			if err != nil {
   441  				return 0, err
   442  			}
   443  		}
   444  
   445  		// fmt.Printf("    accum=%v docDeltaBuffer[%v]=%v\n", de.accum, de.docBufferUpto, de.docDeltaBuffer[de.docBufferUpto])
   446  		de.accum += de.docDeltaBuffer[de.docBufferUpto]
   447  		de.docUpto++
   448  
   449  		if de.liveDocs == nil || de.liveDocs.At(de.accum) {
   450  			de.doc = de.accum
   451  			de.freq = de.freqBuffer[de.docBufferUpto]
   452  			de.docBufferUpto++
   453  			// fmt.Printf("  return doc=%v freq=%v\n", de.doc, de.freq)
   454  			return de.doc, nil
   455  		}
   456  		// fmt.Printf("  doc=%v is deleted; try next doc\n", de.accum)
   457  		de.docBufferUpto++
   458  	}
   459  }
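
// A hedged consumption sketch, added here and not taken from the original
// source: callers obtain a DocsEnum via Lucene41PostingsReader.Docs and drain
// it with NextDoc until NO_MORE_DOCS, reading Freq for each hit when freqs
// were requested.
//
//	for {
//		doc, err := de.NextDoc()
//		if err != nil {
//			return err
//		}
//		if doc == NO_MORE_DOCS {
//			break
//		}
//		freq, _ := de.Freq() // collect or score (doc, freq) here
//		_ = freq
//	}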
   460  
   461  func (de *blockDocsEnum) Advance(target int) (int, error) {
   462  	// TODO: make frq block load lazy/skippable
   463  	// fmt.Printf("  FPR.advance target=%v\n", target)
   464  
   465  	// current skip docID < docIDs generated from current buffer <= next
   466  	// skip docID, we don't need to skip if target is buffered already
   467  	if de.docFreq > LUCENE41_BLOCK_SIZE && target > de.nextSkipDoc {
   468  		fmt.Println("load skipper")
   469  
   470  		panic("not implemented yet")
   471  	}
   472  	if de.docUpto == de.docFreq {
   473  		de.doc = NO_MORE_DOCS
   474  		return de.doc, nil
   475  	}
   476  	if de.docBufferUpto == LUCENE41_BLOCK_SIZE {
   477  		err := de.refillDocs()
   478  		if err != nil {
   479  			return 0, err
   480  		}
   481  	}
   482  
   483  	// Now scan.. this is an inlined/pared down version of nextDoc():
   484  	for {
   485  		// fmt.Printf("  scan doc=%v docBufferUpto=%v\n", de.accum, de.docBufferUpto)
   486  		de.accum += de.docDeltaBuffer[de.docBufferUpto]
   487  		de.docUpto++
   488  
   489  		if de.accum >= target {
   490  			break
   491  		}
   492  		de.docBufferUpto++
   493  		if de.docUpto == de.docFreq {
   494  			de.doc = NO_MORE_DOCS
   495  			return de.doc, nil
   496  		}
   497  	}
   498  
   499  	if de.liveDocs == nil || de.liveDocs.At(de.accum) {
   500  		// fmt.Printf("  return doc=%v\n", de.accum)
   501  		de.freq = de.freqBuffer[de.docBufferUpto]
   502  		de.docBufferUpto++
   503  		de.doc = de.accum
   504  		return de.doc, nil
   505  	} else {
   506  		// fmt.Println("  now do nextDoc()")
   507  		de.docBufferUpto++
   508  		return de.NextDoc()
   509  	}
   510  }