github.com/balzaczyy/golucene@v0.0.0-20151210033525-d0be9ee89713/core/index/invertedDocConsumerPerField.go

package index

import (
	ta "github.com/balzaczyy/golucene/core/analysis/tokenattributes"
	. "github.com/balzaczyy/golucene/core/index/model"
	"github.com/balzaczyy/golucene/core/util"
)

// index/InvertedDocConsumerPerField.java

// type InvertedDocConsumerPerField interface {
// 	// Called once per field, and is given all IndexableField
// 	// occurrences for this field in the document. Return true if you
// 	// wish to see inverted tokens for these fields:
// 	start([]IndexableField, int) (bool, error)
// 	// Called before a field instance is being processed
// 	startField(IndexableField)
// 	// Called once per inverted token
// 	add() error
// 	// Called once per field per document, after all IndexableFields
// 	// are inverted
// 	finish() error
// 	// Called on hitting an aborting error
// 	abort()
// }

const HASH_INIT_SIZE = 4

type TermsHashPerField interface {
	next() TermsHashPerField
	reset()
	addFrom(int) error
	add() error
	finish() error
	start(IndexableField, bool) bool
}

type TermsHashPerFieldSPI interface {
	// Called when a term is seen for the first time.
	newTerm(int)
	// Called when a previously seen term is seen again.
	addTerm(int)
	// Called when the postings array is initialized or resized.
	newPostingsArray()
	// Creates a new postings array of the specified size.
	createPostingsArray(int) *ParallelPostingsArray
}

type TermsHashPerFieldImpl struct {
	spi TermsHashPerFieldSPI

	termsHash TermsHash

	nextPerField TermsHashPerField
	docState     *docState
	fieldState   *FieldInvertState
	termAtt      ta.TermToBytesRefAttribute
	termBytesRef *util.BytesRef

	// Copied from our perThread
	intPool      *util.IntBlockPool
	bytePool     *util.ByteBlockPool
	termBytePool *util.ByteBlockPool

	streamCount   int
	numPostingInt int

	fieldInfo *FieldInfo

	bytesHash *util.BytesRefHash

	postingsArray *ParallelPostingsArray
	bytesUsed     util.Counter

	doNextCall bool

	intUptos     []int
	intUptoStart int
}
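// A concrete consumer is expected to embed TermsHashPerFieldImpl and pass
// itself in as the SPI, so that the shared add() path below can call back
// into it. A minimal, hypothetical sketch of that wiring (myPerField is an
// invented name, not part of this package):
//
// type myPerField struct {
// 	*TermsHashPerFieldImpl
// }
//
// func newMyPerField(termsHash TermsHash, fieldState *FieldInvertState,
// 	fieldInfo *FieldInfo) *myPerField {
//
// 	f := &myPerField{new(TermsHashPerFieldImpl)}
// 	f._constructor(f, 1, fieldState, termsHash, nil, fieldInfo) // 1 stream: doc(+freq)
// 	return f
// }
//
// func (f *myPerField) newTerm(termId int) { /* start a posting list */ }
// func (f *myPerField) addTerm(termId int) { /* extend the posting list */ }
// func (f *myPerField) newPostingsArray()  { /* re-cache typed sub-arrays */ }
// func (f *myPerField) createPostingsArray(size int) *ParallelPostingsArray {
// 	return newParallelPostingsArray(nil, size)
// }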
/*
streamCount: how many streams this field stores per term. E.g.
doc(+freq) is one stream; prox+offset is a second.

NOTE: due to Go's embedding in place of inheritance, this must be
invoked after the embedding child struct has been initialized.
*/
func (h *TermsHashPerFieldImpl) _constructor(spi TermsHashPerFieldSPI,
	streamCount int, fieldState *FieldInvertState,
	termsHash TermsHash, nextPerField TermsHashPerField,
	fieldInfo *FieldInfo) {

	termsHashImpl := termsHash.fields()

	h.spi = spi
	h.intPool = termsHashImpl.intPool
	h.bytePool = termsHashImpl.bytePool
	h.termBytePool = termsHashImpl.termBytePool
	h.docState = termsHashImpl.docState
	h.termsHash = termsHash
	h.bytesUsed = termsHashImpl.bytesUsed
	h.fieldState = fieldState
	h.streamCount = streamCount
	h.numPostingInt = 2 * streamCount
	h.fieldInfo = fieldInfo
	h.nextPerField = nextPerField
	byteStarts := newPostingsBytesStartArray(h, h.bytesUsed)
	h.bytesHash = util.NewBytesRefHash(termsHashImpl.termBytePool, HASH_INIT_SIZE, byteStarts)
}

func (h *TermsHashPerFieldImpl) next() TermsHashPerField {
	return h.nextPerField
}

func (h *TermsHashPerFieldImpl) reset() {
	h.bytesHash.Clear(false)
	if h.nextPerField != nil {
		h.nextPerField.reset()
	}
}

// func (h *TermsHashPerField) abort() {
// 	h.reset()
// 	if h.nextPerField != nil {
// 		h.nextPerField.abort()
// 	}
// }

func (h *TermsHashPerFieldImpl) initReader(reader *ByteSliceReader, termId, stream int) {
	assert(stream < h.streamCount)
	intStart := h.postingsArray.intStarts[termId]
	ints := h.intPool.Buffers[intStart>>util.INT_BLOCK_SHIFT]
	upto := intStart & util.INT_BLOCK_MASK
	reader.init(h.bytePool,
		h.postingsArray.byteStarts[termId]+stream*util.FIRST_LEVEL_SIZE,
		ints[upto+stream])
}

/* Collapse the hash table & sort in-place; returns the sorted term IDs. */
func (h *TermsHashPerFieldImpl) sortPostings(termComp func(a, b []byte) bool) []int {
	return h.bytesHash.Sort(termComp)
}

// func (h *TermsHashPerField) startField(f IndexableField) {
// 	h.termAtt = h.fieldState.attributeSource.Get("TermToBytesRefAttribute").(ta.TermToBytesRefAttribute)
// 	h.termBytesRef = h.termAtt.BytesRef()
// 	assert(h.termBytesRef != nil)
// 	h.consumer.startField(f)
// 	if h.nextPerField != nil {
// 		h.nextPerField.startField(f)
// 	}
// }

// func (h *TermsHashPerField) start(fields []IndexableField, count int) (bool, error) {
// 	var err error
// 	h.doCall, err = h.consumer.start(fields, count)
// 	if err != nil {
// 		return false, err
// 	}
// 	h.bytesHash.Reinit()
// 	if h.nextPerField != nil {
// 		h.doNextCall, err = h.nextPerField.start(fields, count)
// 		if err != nil {
// 			return false, err
// 		}
// 	}
// 	return h.doCall || h.doNextCall, nil
// }

/*
Secondary entry point (for the 2nd & subsequent TermsHash): the token
text has already been "interned" into textStart, so we hash by textStart.
*/
func (h *TermsHashPerFieldImpl) addFrom(textStart int) error {
	panic("not implemented yet")
}

// Simpler version of Lucene's own method
func utf8ToString(iso8859_1_buf []byte) string {
	buf := make([]rune, len(iso8859_1_buf))
	for i, b := range iso8859_1_buf {
		buf[i] = rune(b)
	}
	return string(buf)
}
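// Note that, despite its name, utf8ToString simply widens each byte to a
// rune, i.e. it decodes ISO-8859-1 (Latin-1) rather than UTF-8. For example:
//
// 	utf8ToString([]byte{'L', 'u', 'c', 'e', 'n', 'e'}) // "Lucene"
// 	utf8ToString([]byte{0xE9})                         // "é" (U+00E9)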
/*
Called once per inverted token. This is the primary entry point (for
the first TermsHash); postings use this API.
*/
func (h *TermsHashPerFieldImpl) add() (err error) {
	h.termAtt.FillBytesRef()

	// We are first in the chain so we must "intern" the term text into
	// a textStart address. Get the text & hash of this term.
	var termId int
	if termId, err = h.bytesHash.Add(h.termBytesRef.ToBytes()); err != nil {
		return
	}

	// fmt.Printf("add term=%v doc=%v termId=%v\n",
	// 	string(h.termBytesRef.Value), h.docState.docID, termId)

	if termId >= 0 { // new posting
		h.bytesHash.ByteStart(termId)
		// init stream slices
		if h.numPostingInt+h.intPool.IntUpto > util.INT_BLOCK_SIZE {
			h.intPool.NextBuffer()
		}

		if util.BYTE_BLOCK_SIZE-h.bytePool.ByteUpto < h.numPostingInt*util.FIRST_LEVEL_SIZE {
			h.bytePool.NextBuffer()
		}

		h.intUptos = h.intPool.Buffer
		h.intUptoStart = h.intPool.IntUpto
		h.intPool.IntUpto += h.streamCount

		h.postingsArray.intStarts[termId] = h.intUptoStart + h.intPool.IntOffset

		for i := 0; i < h.streamCount; i++ {
			upto := h.bytePool.NewSlice(util.FIRST_LEVEL_SIZE)
			h.intUptos[h.intUptoStart+i] = upto + h.bytePool.ByteOffset
		}
		h.postingsArray.byteStarts[termId] = h.intUptos[h.intUptoStart]

		h.spi.newTerm(termId)

	} else {
		termId = (-termId) - 1
		intStart := h.postingsArray.intStarts[termId]
		h.intUptos = h.intPool.Buffers[intStart>>util.INT_BLOCK_SHIFT]
		h.intUptoStart = intStart & util.INT_BLOCK_MASK
		h.spi.addTerm(termId)
	}

	if h.doNextCall {
		return h.nextPerField.addFrom(h.postingsArray.textStarts[termId])
	}
	return nil
}

func (h *TermsHashPerFieldImpl) writeByte(stream int, b byte) {
	upto := h.intUptos[h.intUptoStart+stream]
	bytes := h.bytePool.Buffers[upto>>util.BYTE_BLOCK_SHIFT]
	assert(bytes != nil)
	offset := upto & util.BYTE_BLOCK_MASK
	if bytes[offset] != 0 {
		// end of slice; allocate a new one
		offset = h.bytePool.AllocSlice(bytes, offset)
		bytes = h.bytePool.Buffer
		h.intUptos[h.intUptoStart+stream] = offset + h.bytePool.ByteOffset
	}
	bytes[offset] = b
	h.intUptos[h.intUptoStart+stream]++
}

func (h *TermsHashPerFieldImpl) writeVInt(stream, i int) {
	assert(stream < h.streamCount)
	for (i & ^0x7F) != 0 {
		h.writeByte(stream, byte((i&0x7F)|0x80))
		i = int(uint(i) >> 7)
	}
	h.writeByte(stream, byte(i))
}

func (h *TermsHashPerFieldImpl) finish() error {
	if h.nextPerField != nil {
		return h.nextPerField.finish()
	}
	return nil
}
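// A worked example of the VInt encoding produced by writeVInt: the value is
// emitted 7 bits at a time, least-significant group first, with the high bit
// set on every byte except the last. For i = 300 (0b1_0010_1100):
//
// 	1st byte: (300 & 0x7F) | 0x80 = 0xAC
// 	2nd byte: 300 >> 7            = 0x02
//
// so 300 is stored as the two bytes 0xAC 0x02, matching Lucene's VInt format.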
/*
Start adding a new field instance; first is true if this is the first
time this field name was seen in the document.
*/
func (h *TermsHashPerFieldImpl) start(field IndexableField, first bool) bool {
	if h.termAtt = h.fieldState.termAttribute; h.termAtt != nil {
		// EmptyTokenStream can have nil term att
		h.termBytesRef = h.termAtt.BytesRef()
	}
	if h.nextPerField != nil {
		h.doNextCall = h.nextPerField.start(field, first)
	}
	return true
}

type PostingsBytesStartArray struct {
	perField  *TermsHashPerFieldImpl
	bytesUsed util.Counter
}

func newPostingsBytesStartArray(perField *TermsHashPerFieldImpl,
	bytesUsed util.Counter) *PostingsBytesStartArray {
	return &PostingsBytesStartArray{perField, bytesUsed}
}

func (ss *PostingsBytesStartArray) Init() []int {
	if ss.perField.postingsArray == nil {
		arr := ss.perField.spi.createPostingsArray(2)
		ss.perField.postingsArray = arr
		ss.perField.spi.newPostingsArray()
		ss.bytesUsed.AddAndGet(int64(arr.size * arr.bytesPerPosting()))
	}
	return ss.perField.postingsArray.textStarts
}

func (ss *PostingsBytesStartArray) Grow() []int {
	postingsArray := ss.perField.postingsArray
	oldSize := postingsArray.size
	postingsArray = postingsArray.grow()
	ss.perField.postingsArray = postingsArray
	ss.perField.spi.newPostingsArray()
	ss.bytesUsed.AddAndGet(int64(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize)))
	return postingsArray.textStarts
}

func (ss *PostingsBytesStartArray) Clear() []int {
	if arr := ss.perField.postingsArray; arr != nil {
		ss.bytesUsed.AddAndGet(-int64(arr.size * arr.bytesPerPosting()))
		ss.perField.postingsArray = nil
		ss.perField.spi.newPostingsArray()
	}
	return nil
}

func (ss *PostingsBytesStartArray) BytesUsed() util.Counter {
	return ss.bytesUsed
}

// index/ParallelPostingsArray.java

const BYTES_PER_POSTING = 3 * util.NUM_BYTES_INT

type PostingsArray interface {
	bytesPerPosting() int
	newInstance(size int) PostingsArray
	copyTo(toArray PostingsArray, numToCopy int)
}

type ParallelPostingsArray struct {
	PostingsArray
	size       int
	textStarts []int
	intStarts  []int
	byteStarts []int
}

func newParallelPostingsArray(spi PostingsArray, size int) *ParallelPostingsArray {
	return &ParallelPostingsArray{
		PostingsArray: spi,
		size:          size,
		textStarts:    make([]int, size),
		intStarts:     make([]int, size),
		byteStarts:    make([]int, size),
	}
}

func (arr *ParallelPostingsArray) bytesPerPosting() int {
	return BYTES_PER_POSTING
}

func (arr *ParallelPostingsArray) newInstance(size int) PostingsArray { // *ParallelPostingsArray
	ans := newParallelPostingsArray(nil, size)
	ans.PostingsArray = ans
	return ans
}

func (arr *ParallelPostingsArray) grow() *ParallelPostingsArray {
	newSize := util.Oversize(arr.size+1, arr.PostingsArray.bytesPerPosting())
	newArray := arr.PostingsArray.newInstance(newSize)
	arr.PostingsArray.copyTo(newArray, arr.size)
	return newArray.(*ParallelPostingsArray)
}

func (arr *ParallelPostingsArray) copyTo(toArray PostingsArray, numToCopy int) {
	to := toArray.(*ParallelPostingsArray)
	copy(to.textStarts[:numToCopy], arr.textStarts[:numToCopy])
	copy(to.intStarts[:numToCopy], arr.intStarts[:numToCopy])
	copy(to.byteStarts[:numToCopy], arr.byteStarts[:numToCopy])
}
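// Consumers that need extra per-term data embed ParallelPostingsArray and add
// parallel slices of their own, overriding bytesPerPosting, newInstance and
// copyTo so that grow() accounts for and copies the extra columns. A
// hypothetical sketch (myPostingsArray is an invented name):
//
// type myPostingsArray struct {
// 	*ParallelPostingsArray
// 	termFreqs []int // per-term frequency in the current doc
// }
//
// func (arr *myPostingsArray) bytesPerPosting() int {
// 	return BYTES_PER_POSTING + util.NUM_BYTES_INT // one extra int column
// }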