package lucene41

import (
	"fmt"
	"github.com/balzaczyy/golucene/core/codec"
	. "github.com/balzaczyy/golucene/core/codec/spi"
	. "github.com/balzaczyy/golucene/core/index/model"
	. "github.com/balzaczyy/golucene/core/search/model"
	"github.com/balzaczyy/golucene/core/store"
	"github.com/balzaczyy/golucene/core/util"
)

// Lucene41PostingsReader.java

/*
Concrete class that reads docId (maybe frq,pos,offset,payload) list
with postings format.
*/
type Lucene41PostingsReader struct {
	docIn   store.IndexInput // .doc file: docIDs + freqs
	posIn   store.IndexInput // .pos file: positions (nil when the index has no proximity data)
	payIn   store.IndexInput // .pay file: payloads/offsets (nil when neither is indexed)
	forUtil *ForUtil         // block (de)compressor read from the .doc header
	version int              // codec version read from the .doc header
}

// NewLucene41PostingsReader opens the .doc (always), .pos (when the
// index has proximity data) and .pay (when it additionally has payloads
// or offsets) inputs for segment si, validates each file's codec header,
// and for checksum-capable versions verifies the structure of the
// checksum footer. On any error all inputs opened so far are closed
// (errors from closing are suppressed) and the error is returned.
func NewLucene41PostingsReader(dir store.Directory,
	fis FieldInfos, si *SegmentInfo,
	ctx store.IOContext, segmentSuffix string) (r PostingsReaderBase, err error) {

	// fmt.Println("Initializing Lucene41PostingsReader...")
	success := false
	var docIn, posIn, payIn store.IndexInput = nil, nil, nil
	// Cleanup path: any early return before success=true closes
	// whatever has been opened so far.
	defer func() {
		if !success {
			fmt.Println("Failed to initialize Lucene41PostingsReader.")
			util.CloseWhileSuppressingError(docIn, posIn, payIn)
		}
	}()

	docIn, err = dir.OpenInput(util.SegmentFileName(si.Name, segmentSuffix, LUCENE41_DOC_EXTENSION), ctx)
	if err != nil {
		return nil, err
	}
	var version int32
	version, err = codec.CheckHeader(docIn, LUCENE41_DOC_CODEC, LUCENE41_VERSION_START, LUCENE41_VERSION_CURRENT)
	if err != nil {
		return nil, err
	}
	forUtil, err := NewForUtilFrom(docIn)
	if err != nil {
		return nil, err
	}

	if version >= LUCENE41_VERSION_CHECKSUM {
		// NOTE: data file is too costly to verify checksum against all the
		// bytes on open, but for now we at least verify proper structure
		// of the checksum footer: which looks for FOOTER_MAGIC +
		// algorithmID. This is cheap and can detect some forms of
		// corruption such as file truncation.
		if _, err = codec.RetrieveChecksum(docIn); err != nil {
			return nil, err
		}
	}

	if fis.HasProx {
		posIn, err = dir.OpenInput(util.SegmentFileName(si.Name, segmentSuffix, LUCENE41_POS_EXTENSION), ctx)
		if err != nil {
			return nil, err
		}
		// .pos must carry exactly the same version as .doc.
		_, err = codec.CheckHeader(posIn, LUCENE41_POS_CODEC, version, version)
		if err != nil {
			return nil, err
		}

		if version >= LUCENE41_VERSION_CHECKSUM {
			// NOTE: data file is too costly to verify checksum against all the
			// bytes on open, but for now we at least verify proper structure
			// of the checksum footer: which looks for FOOTER_MAGIC +
			// algorithmID. This is cheap and can detect some forms of
			// corruption such as file truncation.
			if _, err = codec.RetrieveChecksum(posIn); err != nil {
				return nil, err
			}
		}

		if fis.HasPayloads || fis.HasOffsets {
			payIn, err = dir.OpenInput(util.SegmentFileName(si.Name, segmentSuffix, LUCENE41_PAY_EXTENSION), ctx)
			if err != nil {
				return nil, err
			}
			// .pay must carry exactly the same version as .doc.
			_, err = codec.CheckHeader(payIn, LUCENE41_PAY_CODEC, version, version)
			if err != nil {
				return nil, err
			}

			if version >= LUCENE41_VERSION_CHECKSUM {
				// NOTE: data file is too costly to verify checksum against all the
				// bytes on open, but for now we at least verify proper structure
				// of the checksum footer: which looks for FOOTER_MAGIC +
				// algorithmID. This is cheap and can detect some forms of
				// corruption such as file truncation.
				if _, err = codec.RetrieveChecksum(payIn); err != nil {
					return nil, err
				}

			}
		}
	}

	success = true
	return &Lucene41PostingsReader{docIn, posIn, payIn, forUtil, int(version)}, nil
}

// Init validates the terms-dictionary input against this reader:
// it checks the terms codec header and that the index-time block size
// matches this reader's LUCENE41_BLOCK_SIZE. A mismatch is a programmer/
// codec bug, hence panic rather than an error return.
func (r *Lucene41PostingsReader) Init(termsIn store.IndexInput) error {
	// fmt.Println("Initializing from:", termsIn)
	// Make sure we are talking to the matching postings writer
	_, err := codec.CheckHeader(termsIn, LUCENE41_TERMS_CODEC, LUCENE41_VERSION_START, LUCENE41_VERSION_CURRENT)
	if err != nil {
		return err
	}
	indexBlockSize, err := termsIn.ReadVInt()
	if err != nil {
		return err
	}
	// fmt.Println("Index block size:", indexBlockSize)
	if indexBlockSize != LUCENE41_BLOCK_SIZE {
		panic(fmt.Sprintf("index-time BLOCK_SIZE (%v) != read-time BLOCK_SIZE (%v)", indexBlockSize, LUCENE41_BLOCK_SIZE))
	}
	return nil
}
135 */ 136 func readVIntBlock(docIn store.IndexInput, docBuffer []int, 137 freqBuffer []int, num int, indexHasFreq bool) (err error) { 138 if indexHasFreq { 139 for i := 0; i < num; i++ { 140 code, err := asInt(docIn.ReadVInt()) 141 if err != nil { 142 return err 143 } 144 docBuffer[i] = int(uint(code) >> 1) 145 if (code & 1) != 0 { 146 freqBuffer[i] = 1 147 } else { 148 freqBuffer[i], err = asInt(docIn.ReadVInt()) 149 if err != nil { 150 return err 151 } 152 } 153 } 154 } else { 155 for i := 0; i < num; i++ { 156 docBuffer[i], err = asInt(docIn.ReadVInt()) 157 if err != nil { 158 return err 159 } 160 } 161 } 162 return nil 163 } 164 165 func asInt(n int32, err error) (int, error) { 166 return int(n), err 167 } 168 169 func (r *Lucene41PostingsReader) NewTermState() *BlockTermState { 170 return newIntBlockTermState().BlockTermState 171 } 172 173 func (r *Lucene41PostingsReader) Close() error { 174 return util.Close(r.docIn, r.posIn, r.payIn) 175 } 176 177 func (r *Lucene41PostingsReader) DecodeTerm(longs []int64, 178 in util.DataInput, fieldInfo *FieldInfo, 179 _termState *BlockTermState, absolute bool) (err error) { 180 181 termState := _termState.Self.(*intBlockTermState) 182 fieldHasPositions := fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS 183 fieldHasOffsets := fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS 184 fieldHasPayloads := fieldInfo.HasPayloads() 185 186 if absolute { 187 termState.docStartFP = 0 188 termState.posStartFP = 0 189 termState.payStartFP = 0 190 } 191 if r.version < LUCENE41_VERSION_META_ARRAY { // backward compatibility 192 return r._decodeTerm(in, fieldInfo, termState) 193 } 194 termState.docStartFP += longs[0] 195 if fieldHasPositions { 196 termState.posStartFP += longs[1] 197 if fieldHasOffsets || fieldHasPayloads { 198 termState.payStartFP += longs[2] 199 } 200 } 201 if termState.DocFreq == 1 { 202 if termState.singletonDocID, err = asInt(in.ReadVInt()); err != nil { 203 return 204 } 
205 } else { 206 termState.singletonDocID = -1 207 } 208 if fieldHasPositions { 209 if termState.TotalTermFreq > LUCENE41_BLOCK_SIZE { 210 if termState.lastPosBlockOffset, err = in.ReadVLong(); err != nil { 211 return err 212 } 213 } else { 214 termState.lastPosBlockOffset = -1 215 } 216 } 217 if termState.DocFreq > LUCENE41_BLOCK_SIZE { 218 if termState.skipOffset, err = in.ReadVLong(); err != nil { 219 return 220 } 221 } else { 222 termState.skipOffset = -1 223 } 224 return nil 225 } 226 227 func (r *Lucene41PostingsReader) _decodeTerm(in util.DataInput, 228 fieldInfo *FieldInfo, termState *intBlockTermState) (err error) { 229 230 fieldHasPositions := fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS 231 fieldHasOffsets := fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS 232 fieldHasPaylods := fieldInfo.HasPayloads() 233 if termState.DocFreq == 1 { 234 if termState.singletonDocID, err = asInt(in.ReadVInt()); err != nil { 235 return 236 } 237 } else { 238 termState.singletonDocID = -1 239 var n int64 240 if n, err = in.ReadVLong(); err != nil { 241 return 242 } 243 termState.docStartFP += n 244 } 245 if fieldHasPositions { 246 var n int64 247 if n, err = in.ReadVLong(); err != nil { 248 return 249 } 250 termState.posStartFP += n 251 if termState.TotalTermFreq > LUCENE41_BLOCK_SIZE { 252 if n, err = in.ReadVLong(); err != nil { 253 return 254 } 255 termState.lastPosBlockOffset += n 256 } else { 257 termState.lastPosBlockOffset = -1 258 } 259 if (fieldHasPaylods || fieldHasOffsets) && termState.TotalTermFreq >= LUCENE41_BLOCK_SIZE { 260 if n, err = in.ReadVLong(); err != nil { 261 return 262 } 263 termState.payStartFP += n 264 } 265 } 266 if termState.DocFreq > LUCENE41_BLOCK_SIZE { 267 if termState.skipOffset, err = in.ReadVLong(); err != nil { 268 return 269 } 270 } else { 271 termState.skipOffset = -1 272 } 273 return nil 274 } 275 276 func (r *Lucene41PostingsReader) Docs(fieldInfo *FieldInfo, 277 termState 
*BlockTermState, liveDocs util.Bits, 278 reuse DocsEnum, flags int) (de DocsEnum, err error) { 279 280 var docsEnum *blockDocsEnum 281 if v, ok := reuse.(*blockDocsEnum); ok { 282 docsEnum = v 283 if !docsEnum.canReuse(r.docIn, fieldInfo) { 284 docsEnum = newBlockDocsEnum(r, fieldInfo) 285 } 286 } else { 287 docsEnum = newBlockDocsEnum(r, fieldInfo) 288 } 289 return docsEnum.reset(liveDocs, termState.Self.(*intBlockTermState), flags) 290 } 291 292 type blockDocsEnum struct { 293 *Lucene41PostingsReader // embedded struct 294 295 encoded []byte 296 297 docDeltaBuffer []int 298 freqBuffer []int 299 300 docBufferUpto int 301 302 // skipper Lucene41SkipReader 303 skipped bool 304 305 startDocIn store.IndexInput 306 307 docIn store.IndexInput 308 indexHasFreq bool 309 indexHasPos bool 310 indexHasOffsets bool 311 indexHasPayloads bool 312 313 docFreq int 314 totalTermFreq int64 315 docUpto int 316 doc int 317 accum int 318 freq int 319 320 // Where this term's postings start in the .doc file: 321 docTermStartFP int64 322 323 // Where this term's skip data starts (after 324 // docTermStartFP) in the .doc file (or -1 if there is 325 // no skip data for this term): 326 skipOffset int64 327 328 // docID for next skip point, we won't use skipper if 329 // target docID is not larger than this 330 nextSkipDoc int 331 332 liveDocs util.Bits 333 334 needsFreq bool 335 singletonDocID int 336 } 337 338 func newBlockDocsEnum(owner *Lucene41PostingsReader, 339 fieldInfo *FieldInfo) *blockDocsEnum { 340 341 return &blockDocsEnum{ 342 Lucene41PostingsReader: owner, 343 docDeltaBuffer: make([]int, MAX_DATA_SIZE), 344 freqBuffer: make([]int, MAX_DATA_SIZE), 345 startDocIn: owner.docIn, 346 docIn: nil, 347 indexHasFreq: fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS, 348 indexHasPos: fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS, 349 indexHasOffsets: fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS, 350 indexHasPayloads: 
fieldInfo.HasPayloads(), 351 encoded: make([]byte, MAX_ENCODED_SIZE), 352 } 353 } 354 355 func (de *blockDocsEnum) canReuse(docIn store.IndexInput, fieldInfo *FieldInfo) bool { 356 return docIn == de.startDocIn && 357 de.indexHasFreq == (fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS) && 358 de.indexHasPos == (fieldInfo.IndexOptions() >= INDEX_OPT_DOCS_AND_FREQS_AND_POSITIONS) && 359 de.indexHasPayloads == fieldInfo.HasPayloads() 360 } 361 362 func (de *blockDocsEnum) reset(liveDocs util.Bits, termState *intBlockTermState, flags int) (ret DocsEnum, err error) { 363 de.liveDocs = liveDocs 364 // fmt.Println(" FPR.reset: termState=", termState) 365 de.docFreq = termState.DocFreq 366 if de.indexHasFreq { 367 de.totalTermFreq = termState.TotalTermFreq 368 } else { 369 de.totalTermFreq = int64(de.docFreq) 370 } 371 de.docTermStartFP = termState.docStartFP // <---- docTermStartFP should be 178 instead of 0 372 de.skipOffset = termState.skipOffset 373 de.singletonDocID = termState.singletonDocID 374 if de.docFreq > 1 { 375 if de.docIn == nil { 376 // lazy init 377 de.docIn = de.startDocIn.Clone() 378 } 379 err = de.docIn.Seek(de.docTermStartFP) 380 if err != nil { 381 return nil, err 382 } 383 } 384 385 de.doc = -1 386 de.needsFreq = (flags & DOCS_ENUM_FLAG_FREQS) != 0 387 if !de.indexHasFreq { 388 for i, _ := range de.freqBuffer { 389 de.freqBuffer[i] = 1 390 } 391 } 392 de.accum = 0 393 de.docUpto = 0 394 de.nextSkipDoc = LUCENE41_BLOCK_SIZE - 1 // we won't skip if target is found in first block 395 de.docBufferUpto = LUCENE41_BLOCK_SIZE 396 de.skipped = false 397 return de, nil 398 } 399 400 func (de *blockDocsEnum) Freq() (n int, err error) { 401 return de.freq, nil 402 } 403 404 func (de *blockDocsEnum) DocId() int { 405 return de.doc 406 } 407 408 func (de *blockDocsEnum) refillDocs() (err error) { 409 left := de.docFreq - de.docUpto 410 assert(left > 0) 411 412 if left >= LUCENE41_BLOCK_SIZE { 413 fmt.Println(" fill doc block from fp=", 
de.docIn.FilePointer()) 414 panic("not implemented yet") 415 } else if de.docFreq == 1 { 416 de.docDeltaBuffer[0] = de.singletonDocID 417 de.freqBuffer[0] = int(de.totalTermFreq) 418 } else { 419 // Read vInts: 420 // fmt.Println(" fill last vInt block from fp=", de.docIn.FilePointer()) 421 err = readVIntBlock(de.docIn, de.docDeltaBuffer, de.freqBuffer, left, de.indexHasFreq) 422 } 423 de.docBufferUpto = 0 424 return 425 } 426 427 func (de *blockDocsEnum) NextDoc() (n int, err error) { 428 // fmt.Println("FPR.nextDoc") 429 for { 430 // fmt.Printf(" docUpto=%v (of df=%v) docBufferUpto=%v\n", de.docUpto, de.docFreq, de.docBufferUpto) 431 432 if de.docUpto == de.docFreq { 433 // fmt.Println(" return doc=END") 434 de.doc = NO_MORE_DOCS 435 return de.doc, nil 436 } 437 438 if de.docBufferUpto == LUCENE41_BLOCK_SIZE { 439 err = de.refillDocs() 440 if err != nil { 441 return 0, err 442 } 443 } 444 445 // fmt.Printf(" accum=%v docDeltaBuffer[%v]=%v\n", de.accum, de.docBufferUpto, de.docDeltaBuffer[de.docBufferUpto]) 446 de.accum += de.docDeltaBuffer[de.docBufferUpto] 447 de.docUpto++ 448 449 if de.liveDocs == nil || de.liveDocs.At(de.accum) { 450 de.doc = de.accum 451 de.freq = de.freqBuffer[de.docBufferUpto] 452 de.docBufferUpto++ 453 // fmt.Printf(" return doc=%v freq=%v\n", de.doc, de.freq) 454 return de.doc, nil 455 } 456 // fmt.Printf(" doc=%v is deleted; try next doc\n", de.accum) 457 de.docBufferUpto++ 458 } 459 } 460 461 func (de *blockDocsEnum) Advance(target int) (int, error) { 462 // TODO: make frq block load lazy/skippable 463 fmt.Printf(" FPR.advance target=%v\n", target) 464 465 // current skip docID < docIDs generated from current buffer <= next 466 // skip docID, we don't need to skip if target is buffered already 467 if de.docFreq > LUCENE41_BLOCK_SIZE && target > de.nextSkipDoc { 468 fmt.Println("load skipper") 469 470 panic("not implemented yet") 471 } 472 if de.docUpto == de.docFreq { 473 de.doc = NO_MORE_DOCS 474 return de.doc, nil 475 } 476 if 
de.docBufferUpto == LUCENE41_BLOCK_SIZE { 477 err := de.refillDocs() 478 if err != nil { 479 return 0, nil 480 } 481 } 482 483 // Now scan.. this is an inlined/pared down version of nextDoc(): 484 for { 485 fmt.Printf(" scan doc=%v docBufferUpto=%v\n", de.accum, de.docBufferUpto) 486 de.accum += de.docDeltaBuffer[de.docBufferUpto] 487 de.docUpto++ 488 489 if de.accum >= target { 490 break 491 } 492 de.docBufferUpto++ 493 if de.docUpto == de.docFreq { 494 de.doc = NO_MORE_DOCS 495 return de.doc, nil 496 } 497 } 498 499 if de.liveDocs == nil || de.liveDocs.At(de.accum) { 500 fmt.Printf(" return doc=%v\n", de.accum) 501 de.freq = de.freqBuffer[de.docBufferUpto] 502 de.docBufferUpto++ 503 de.doc = de.accum 504 return de.doc, nil 505 } else { 506 fmt.Println(" now do nextDoc()") 507 de.docBufferUpto++ 508 return de.NextDoc() 509 } 510 }