github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/invertedDocConsumerPerField.go

package index

import (
	ta "github.com/balzaczyy/golucene/core/analysis/tokenattributes"
	. "github.com/balzaczyy/golucene/core/index/model"
	"github.com/balzaczyy/golucene/core/util"
)

// index/InvertedDocConsumerPerField.java

// type InvertedDocConsumerPerField interface {
// 	// Called once per field, and is given all IndexableField
// 	// occurrences for this field in the document. Return true if you
// 	// wish to see inverted tokens for these fields:
// 	start([]IndexableField, int) (bool, error)
// 	// Called before a field instance is being processed
// 	startField(IndexableField)
// 	// Called once per inverted token
// 	add() error
// 	// Called once per field per document, after all IndexableFields
// 	// are inverted
// 	finish() error
// 	// Called on hitting an aborting error
// 	abort()
// }

const HASH_INIT_SIZE = 4

type TermsHashPerField interface {
	next() TermsHashPerField
	reset()
	addFrom(int) error
	add() error
	finish() error
	start(IndexableField, bool) bool
}
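
// A hedged note on the chain: in Lucene's design the primary
// TermsHashPerField writes postings (FreqProxTermsWriterPerField in the
// Java code) and its next() link writes term vectors
// (TermVectorsConsumerPerField). This file only defines the shared
// plumbing those consumers embed.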

type TermsHashPerFieldSPI interface {
	// Called when a term is seen for the first time.
	newTerm(int)
	// Called when a previously seen term is seen again.
	addTerm(int)
	// Called when postings array is initialized or resized.
	newPostingsArray()
	// Creates a new postings array of the specified size.
	createPostingsArray(int) *ParallelPostingsArray
}

type TermsHashPerFieldImpl struct {
	spi TermsHashPerFieldSPI

	termsHash TermsHash

	nextPerField TermsHashPerField
	docState     *docState
	fieldState   *FieldInvertState
	termAtt      ta.TermToBytesRefAttribute
	termBytesRef *util.BytesRef

	// Copied from our perThread
	intPool      *util.IntBlockPool
	bytePool     *util.ByteBlockPool
	termBytePool *util.ByteBlockPool

	streamCount   int
	numPostingInt int

	fieldInfo *FieldInfo

	bytesHash *util.BytesRefHash

	postingsArray *ParallelPostingsArray
	bytesUsed     util.Counter

	doNextCall bool

	intUptos     []int
	intUptoStart int
}

/*
streamCount: how many streams this field stores per term, e.g.
doc(+freq) is one stream, and prox+offset is a second.

NOTE: because Go uses embedding rather than Java-style inheritance,
this must be invoked by the child type after the embedded struct has
been initialized, with the child passing itself as the SPI (see the
sketch below).
*/
func (h *TermsHashPerFieldImpl) _constructor(spi TermsHashPerFieldSPI,
	streamCount int, fieldState *FieldInvertState,
	termsHash TermsHash, nextPerField TermsHashPerField,
	fieldInfo *FieldInfo) {

	termsHashImpl := termsHash.fields()

	h.spi = spi
	h.intPool = termsHashImpl.intPool
	h.bytePool = termsHashImpl.bytePool
	h.termBytePool = termsHashImpl.termBytePool
	h.docState = termsHashImpl.docState
	h.termsHash = termsHash
	h.bytesUsed = termsHashImpl.bytesUsed
	h.fieldState = fieldState
	h.streamCount = streamCount
	h.numPostingInt = 2 * streamCount
	h.fieldInfo = fieldInfo
	h.nextPerField = nextPerField
	byteStarts := newPostingsBytesStartArray(h, h.bytesUsed)
	h.bytesHash = util.NewBytesRefHash(termsHashImpl.termBytePool, HASH_INIT_SIZE, byteStarts)
}
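
// A minimal, hypothetical sketch (not part of this package) of the
// embedding pattern the NOTE above requires; the real consumers are the
// FreqProx/TermVectors per-field types:
//
// type myPerField struct {
// 	TermsHashPerFieldImpl
// }
//
// func newMyPerField(fieldState *FieldInvertState, termsHash TermsHash,
// 	fieldInfo *FieldInfo) *myPerField {
// 	f := new(myPerField)
// 	// one stream (doc+freq only); safe only after f has been allocated
// 	f._constructor(f, 1, fieldState, termsHash, nil, fieldInfo)
// 	return f
// }
//
// func (f *myPerField) newTerm(termId int) {
// 	f.writeVInt(0, f.docState.docID) // first sighting: record the doc
// }
//
// func (f *myPerField) addTerm(termId int) {}
//
// func (f *myPerField) newPostingsArray() {}
//
// func (f *myPerField) createPostingsArray(size int) *ParallelPostingsArray {
// 	arr := newParallelPostingsArray(nil, size)
// 	arr.PostingsArray = arr // same self-referential wiring as newInstance below
// 	return arr
// }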

func (h *TermsHashPerFieldImpl) next() TermsHashPerField {
	return h.nextPerField
}

func (h *TermsHashPerFieldImpl) reset() {
	h.bytesHash.Clear(false)
	if h.nextPerField != nil {
		h.nextPerField.reset()
	}
}

// func (h *TermsHashPerField) abort() {
// 	h.reset()
// 	if h.nextPerField != nil {
// 		h.nextPerField.abort()
// 	}
// }

/* Positions the reader over the byte slice holding the given term's
data for the given stream. */
func (h *TermsHashPerFieldImpl) initReader(reader *ByteSliceReader, termId, stream int) {
	assert(stream < h.streamCount)
	intStart := h.postingsArray.intStarts[termId]
	ints := h.intPool.Buffers[intStart>>util.INT_BLOCK_SHIFT]
	upto := intStart & util.INT_BLOCK_MASK
	reader.init(h.bytePool,
		h.postingsArray.byteStarts[termId]+stream*util.FIRST_LEVEL_SIZE,
		ints[upto+stream])
}

/*
Collapse the hash table and sort in-place; returns the sorted term IDs.
Sorting happens at flush time, so terms can be written out in order.
*/
func (h *TermsHashPerFieldImpl) sortPostings(termComp func(a, b []byte) bool) []int {
	return h.bytesHash.Sort(termComp)
}

// func (h *TermsHashPerField) startField(f IndexableField) {
// 	h.termAtt = h.fieldState.attributeSource.Get("TermToBytesRefAttribute").(ta.TermToBytesRefAttribute)
// 	h.termBytesRef = h.termAtt.BytesRef()
// 	assert(h.termBytesRef != nil)
// 	h.consumer.startField(f)
// 	if h.nextPerField != nil {
// 		h.nextPerField.startField(f)
// 	}
// }

// func (h *TermsHashPerField) start(fields []IndexableField, count int) (bool, error) {
// 	var err error
// 	h.doCall, err = h.consumer.start(fields, count)
// 	if err != nil {
// 		return false, err
// 	}
// 	h.bytesHash.Reinit()
// 	if h.nextPerField != nil {
// 		h.doNextCall, err = h.nextPerField.start(fields, count)
// 		if err != nil {
// 			return false, err
// 		}
// 	}
// 	return h.doCall || h.doNextCall, nil
// }

/*
Secondary entry point (for the 2nd and subsequent TermsHash in the
chain): the token text has already been "interned" into textStart, so
we hash by textStart.
*/
func (h *TermsHashPerFieldImpl) addFrom(textStart int) error {
	panic("not implemented yet")
}

// Simpler version of Lucene's own method: decodes each byte as an
// ISO-8859-1 (Latin-1) code point, e.g. utf8ToString([]byte{0x41, 0xE9})
// returns "Aé".
func utf8ToString(iso8859_1_buf []byte) string {
	buf := make([]rune, len(iso8859_1_buf))
	for i, b := range iso8859_1_buf {
		buf[i] = rune(b)
	}
	return string(buf)
}

/*
Called once per inverted token. This is the primary entry point (for
the first TermsHash); postings use this API.
*/
func (h *TermsHashPerFieldImpl) add() (err error) {
	h.termAtt.FillBytesRef()

	// We are first in the chain so we must "intern" the term text into
	// textStart address. Get the text & hash of this term.
	var termId int
	if termId, err = h.bytesHash.Add(h.termBytesRef.ToBytes()); err != nil {
		return
	}

	// fmt.Printf("add term=%v doc=%v termId=%v\n",
	// 	string(h.termBytesRef.Value), h.docState.docID, termId)

	if termId >= 0 { // new posting
		h.bytesHash.ByteStart(termId)
		// init stream slices: make sure both pools have room for this
		// term's int write heads and first-level byte slices
		if h.numPostingInt+h.intPool.IntUpto > util.INT_BLOCK_SIZE {
			h.intPool.NextBuffer()
		}

		if util.BYTE_BLOCK_SIZE-h.bytePool.ByteUpto < h.numPostingInt*util.FIRST_LEVEL_SIZE {
			h.bytePool.NextBuffer()
		}

		h.intUptos = h.intPool.Buffer
		h.intUptoStart = h.intPool.IntUpto
		h.intPool.IntUpto += h.streamCount

		h.postingsArray.intStarts[termId] = h.intUptoStart + h.intPool.IntOffset

		// allocate a first-level byte slice per stream; each int slot
		// tracks the current write position within that stream's slice
		for i := 0; i < h.streamCount; i++ {
			upto := h.bytePool.NewSlice(util.FIRST_LEVEL_SIZE)
			h.intUptos[h.intUptoStart+i] = upto + h.bytePool.ByteOffset
		}
		h.postingsArray.byteStarts[termId] = h.intUptos[h.intUptoStart]

		h.spi.newTerm(termId)

	} else {
		// term already seen; BytesRefHash encodes the existing id as -(id+1)
		termId = (-termId) - 1
		intStart := h.postingsArray.intStarts[termId]
		h.intUptos = h.intPool.Buffers[intStart>>util.INT_BLOCK_SHIFT]
		h.intUptoStart = intStart & util.INT_BLOCK_MASK
		h.spi.addTerm(termId)
	}

	if h.doNextCall {
		return h.nextPerField.addFrom(h.postingsArray.textStarts[termId])
	}
	return nil
}

func (h *TermsHashPerFieldImpl) writeByte(stream int, b byte) {
	upto := h.intUptos[h.intUptoStart+stream]
	bytes := h.bytePool.Buffers[upto>>util.BYTE_BLOCK_SHIFT]
	assert(bytes != nil)
	offset := upto & util.BYTE_BLOCK_MASK
	if bytes[offset] != 0 {
		// A non-zero byte marks the end of the slice; allocate the
		// next, larger slice and redirect the write head to it.
		offset = h.bytePool.AllocSlice(bytes, offset)
		bytes = h.bytePool.Buffer
		h.intUptos[h.intUptoStart+stream] = offset + h.bytePool.ByteOffset
	}
	bytes[offset] = b
	h.intUptos[h.intUptoStart+stream]++
}

func (h *TermsHashPerFieldImpl) writeVInt(stream, i int) {
	assert(stream < h.streamCount)
	for (i & ^0x7F) != 0 {
		h.writeByte(stream, byte((i&0x7F)|0x80))
		i = int(uint(i) >> 7)
	}
	h.writeByte(stream, byte(i))
}
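
// A worked example of the VInt encoding above (7 data bits per byte,
// high bit set on all but the last byte): writeVInt(s, 300) emits 0xAC
// then 0x02, since 300 = 0b10_0101100, the low seven bits 0x2C gain the
// continuation bit (0xAC), and the remaining bits yield 0x02.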

func (h *TermsHashPerFieldImpl) finish() error {
	if h.nextPerField != nil {
		return h.nextPerField.finish()
	}
	return nil
}

/*
Start adding a new field instance; first is true if this is the first
time this field name was seen in the document.
*/
func (h *TermsHashPerFieldImpl) start(field IndexableField, first bool) bool {
	if h.termAtt = h.fieldState.termAttribute; h.termAtt != nil {
		// EmptyTokenStream can have nil term att
		h.termBytesRef = h.termAtt.BytesRef()
	}
	if h.nextPerField != nil {
		h.doNextCall = h.nextPerField.start(field, first)
	}
	return true
}

// PostingsBytesStartArray backs the BytesRefHash with the textStarts
// column of the postings array and keeps bytesUsed in sync as the
// array is created, grown, and cleared.
type PostingsBytesStartArray struct {
	perField  *TermsHashPerFieldImpl
	bytesUsed util.Counter
}

func newPostingsBytesStartArray(perField *TermsHashPerFieldImpl,
	bytesUsed util.Counter) *PostingsBytesStartArray {
	return &PostingsBytesStartArray{perField, bytesUsed}
}

func (ss *PostingsBytesStartArray) Init() []int {
	if ss.perField.postingsArray == nil {
		arr := ss.perField.spi.createPostingsArray(2)
		ss.perField.postingsArray = arr
		ss.perField.spi.newPostingsArray()
		ss.bytesUsed.AddAndGet(int64(arr.size * arr.bytesPerPosting()))
	}
	return ss.perField.postingsArray.textStarts
}

func (ss *PostingsBytesStartArray) Grow() []int {
	postingsArray := ss.perField.postingsArray
	oldSize := postingsArray.size
	postingsArray = postingsArray.grow()
	ss.perField.postingsArray = postingsArray
	ss.perField.spi.newPostingsArray()
	ss.bytesUsed.AddAndGet(int64(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize)))
	return postingsArray.textStarts
}

func (ss *PostingsBytesStartArray) Clear() []int {
	if arr := ss.perField.postingsArray; arr != nil {
		ss.bytesUsed.AddAndGet(-int64(arr.size * arr.bytesPerPosting()))
		ss.perField.postingsArray = nil
		ss.perField.spi.newPostingsArray()
	}
	return nil
}

func (ss *PostingsBytesStartArray) BytesUsed() util.Counter {
	return ss.bytesUsed
}
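
// A hedged note on the lifecycle: these methods are driven by
// util.BytesRefHash rather than called here directly; Init runs before
// the first term is added, Grow when the hash needs room for more term
// IDs, and Clear on reset, so bytesUsed tracks the postings array's RAM
// throughout.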

// index/ParallelPostingsArray.java

const BYTES_PER_POSTING = 3 * util.NUM_BYTES_INT

type PostingsArray interface {
	bytesPerPosting() int
	newInstance(size int) PostingsArray
	copyTo(toArray PostingsArray, numToCopy int)
}

type ParallelPostingsArray struct {
	PostingsArray
	size       int
	textStarts []int // where each term's text starts in the term byte pool
	intStarts  []int // where each term's per-stream write heads live in the int pool
	byteStarts []int // where each term's stream-0 slice starts in the byte pool
}

func newParallelPostingsArray(spi PostingsArray, size int) *ParallelPostingsArray {
	return &ParallelPostingsArray{
		PostingsArray: spi,
		size:          size,
		textStarts:    make([]int, size),
		intStarts:     make([]int, size),
		byteStarts:    make([]int, size),
	}
}

func (arr *ParallelPostingsArray) bytesPerPosting() int {
	return BYTES_PER_POSTING
}

func (arr *ParallelPostingsArray) newInstance(size int) PostingsArray { // *ParallelPostingsArray
	ans := newParallelPostingsArray(nil, size)
	ans.PostingsArray = ans
	return ans
}

func (arr *ParallelPostingsArray) grow() *ParallelPostingsArray {
	newSize := util.Oversize(arr.size+1, arr.PostingsArray.bytesPerPosting())
	newArray := arr.PostingsArray.newInstance(newSize)
	arr.PostingsArray.copyTo(newArray, arr.size)
	return newArray.(*ParallelPostingsArray)
}

func (arr *ParallelPostingsArray) copyTo(toArray PostingsArray, numToCopy int) {
	to := toArray.(*ParallelPostingsArray)
	copy(to.textStarts[:numToCopy], arr.textStarts[:numToCopy])
	copy(to.intStarts[:numToCopy], arr.intStarts[:numToCopy])
	copy(to.byteStarts[:numToCopy], arr.byteStarts[:numToCopy])
}